author    Paolo Bonzini <pbonzini@redhat.com>  2013-08-12 03:43:45 -0400
committer Paolo Bonzini <pbonzini@redhat.com>  2013-08-12 03:43:45 -0400
commit    cada23f308e3869ceb5c75f164d249448dfaec07 (patch)
tree      97c7aebcad0eb2a93a7519251a01f5be9255ee75 /drivers/md
parent    e769ece3b129698d2b09811a6f6d304e4eaa8c29 (diff)
parent    6c8c0c4dc0e98ee2191211d66e9f876e95787073 (diff)
Merge branch 'kvm-arm64/fixes-3.11-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms into kvm-master
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c     |  46
-rw-r--r--  drivers/md/bcache/bcache.h    |  61
-rw-r--r--  drivers/md/bcache/bset.c      |  56
-rw-r--r--  drivers/md/bcache/bset.h      |   4
-rw-r--r--  drivers/md/bcache/btree.c     | 451
-rw-r--r--  drivers/md/bcache/btree.h     |  35
-rw-r--r--  drivers/md/bcache/closure.c   |   6
-rw-r--r--  drivers/md/bcache/debug.c     | 178
-rw-r--r--  drivers/md/bcache/debug.h     |  11
-rw-r--r--  drivers/md/bcache/io.c        |  68
-rw-r--r--  drivers/md/bcache/journal.c   |  25
-rw-r--r--  drivers/md/bcache/movinggc.c  |  24
-rw-r--r--  drivers/md/bcache/request.c   | 197
-rw-r--r--  drivers/md/bcache/request.h   |   2
-rw-r--r--  drivers/md/bcache/super.c     | 171
-rw-r--r--  drivers/md/bcache/sysfs.c     |  68
-rw-r--r--  drivers/md/bcache/trace.c     |  47
-rw-r--r--  drivers/md/bcache/util.c      |  17
-rw-r--r--  drivers/md/bcache/util.h      |   6
-rw-r--r--  drivers/md/bcache/writeback.c | 133
-rw-r--r--  drivers/md/bcache/writeback.h |  64
-rw-r--r--  drivers/md/raid10.c           |   8
-rw-r--r--  drivers/md/raid5.c            |  15
-rw-r--r--  drivers/md/raid5.h            |   1
24 files changed, 887 insertions(+), 807 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08b..e45f5575fd4d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,7 +63,10 @@
63#include "bcache.h" 63#include "bcache.h"
64#include "btree.h" 64#include "btree.h"
65 65
66#include <linux/freezer.h>
67#include <linux/kthread.h>
66#include <linux/random.h> 68#include <linux/random.h>
69#include <trace/events/bcache.h>
67 70
68#define MAX_IN_FLIGHT_DISCARDS 8U 71#define MAX_IN_FLIGHT_DISCARDS 8U
69 72
@@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w)
151 mutex_unlock(&ca->set->bucket_lock); 154 mutex_unlock(&ca->set->bucket_lock);
152 155
153 closure_wake_up(&ca->set->bucket_wait); 156 closure_wake_up(&ca->set->bucket_wait);
154 wake_up(&ca->set->alloc_wait); 157 wake_up_process(ca->alloc_thread);
155 158
156 closure_put(&ca->set->cl); 159 closure_put(&ca->set->cl);
157} 160}
@@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca)
350 break; 353 break;
351 } 354 }
352 355
353 pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", 356 trace_bcache_alloc_invalidate(ca);
354 fifo_used(&ca->free), ca->free.size,
355 fifo_used(&ca->free_inc), ca->free_inc.size,
356 fifo_used(&ca->unused), ca->unused.size);
357} 357}
358 358
359#define allocator_wait(ca, cond) \ 359#define allocator_wait(ca, cond) \
360do { \ 360do { \
361 DEFINE_WAIT(__wait); \
362 \
363 while (1) { \ 361 while (1) { \
364 prepare_to_wait(&ca->set->alloc_wait, \ 362 set_current_state(TASK_INTERRUPTIBLE); \
365 &__wait, TASK_INTERRUPTIBLE); \
366 if (cond) \ 363 if (cond) \
367 break; \ 364 break; \
368 \ 365 \
369 mutex_unlock(&(ca)->set->bucket_lock); \ 366 mutex_unlock(&(ca)->set->bucket_lock); \
370 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ 367 if (kthread_should_stop()) \
371 finish_wait(&ca->set->alloc_wait, &__wait); \ 368 return 0; \
372 closure_return(cl); \
373 } \
374 \ 369 \
370 try_to_freeze(); \
375 schedule(); \ 371 schedule(); \
376 mutex_lock(&(ca)->set->bucket_lock); \ 372 mutex_lock(&(ca)->set->bucket_lock); \
377 } \ 373 } \
378 \ 374 __set_current_state(TASK_RUNNING); \
379 finish_wait(&ca->set->alloc_wait, &__wait); \
380} while (0) 375} while (0)
381 376
382void bch_allocator_thread(struct closure *cl) 377static int bch_allocator_thread(void *arg)
383{ 378{
384 struct cache *ca = container_of(cl, struct cache, alloc); 379 struct cache *ca = arg;
385 380
386 mutex_lock(&ca->set->bucket_lock); 381 mutex_lock(&ca->set->bucket_lock);
387 382
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
442{ 437{
443 long r = -1; 438 long r = -1;
444again: 439again:
445 wake_up(&ca->set->alloc_wait); 440 wake_up_process(ca->alloc_thread);
446 441
447 if (fifo_used(&ca->free) > ca->watermark[watermark] && 442 if (fifo_used(&ca->free) > ca->watermark[watermark] &&
448 fifo_pop(&ca->free, r)) { 443 fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
476 return r; 471 return r;
477 } 472 }
478 473
479 pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", 474 trace_bcache_alloc_fail(ca);
480 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
481 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
482 475
483 if (cl) { 476 if (cl) {
484 closure_wait(&ca->set->bucket_wait, cl); 477 closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
552 545
553/* Init */ 546/* Init */
554 547
548int bch_cache_allocator_start(struct cache *ca)
549{
550 struct task_struct *k = kthread_run(bch_allocator_thread,
551 ca, "bcache_allocator");
552 if (IS_ERR(k))
553 return PTR_ERR(k);
554
555 ca->alloc_thread = k;
556 return 0;
557}
558
555void bch_cache_allocator_exit(struct cache *ca) 559void bch_cache_allocator_exit(struct cache *ca)
556{ 560{
557 struct discard *d; 561 struct discard *d;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4ab..b39f6f0b45f2 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -178,7 +178,6 @@
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179 179
180#include <linux/bio.h> 180#include <linux/bio.h>
181#include <linux/blktrace_api.h>
182#include <linux/kobject.h> 181#include <linux/kobject.h>
183#include <linux/list.h> 182#include <linux/list.h>
184#include <linux/mutex.h> 183#include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
388typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); 387typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
389 388
390struct keybuf { 389struct keybuf {
391 keybuf_pred_fn *key_predicate;
392
393 struct bkey last_scanned; 390 struct bkey last_scanned;
394 spinlock_t lock; 391 spinlock_t lock;
395 392
@@ -437,9 +434,12 @@ struct bcache_device {
437 434
438 /* If nonzero, we're detaching/unregistering from cache set */ 435 /* If nonzero, we're detaching/unregistering from cache set */
439 atomic_t detaching; 436 atomic_t detaching;
437 int flush_done;
438
439 uint64_t nr_stripes;
440 unsigned stripe_size_bits;
441 atomic_t *stripe_sectors_dirty;
440 442
441 atomic_long_t sectors_dirty;
442 unsigned long sectors_dirty_gc;
443 unsigned long sectors_dirty_last; 443 unsigned long sectors_dirty_last;
444 long sectors_dirty_derivative; 444 long sectors_dirty_derivative;
445 445
@@ -531,6 +531,7 @@ struct cached_dev {
531 unsigned sequential_merge:1; 531 unsigned sequential_merge:1;
532 unsigned verify:1; 532 unsigned verify:1;
533 533
534 unsigned partial_stripes_expensive:1;
534 unsigned writeback_metadata:1; 535 unsigned writeback_metadata:1;
535 unsigned writeback_running:1; 536 unsigned writeback_running:1;
536 unsigned char writeback_percent; 537 unsigned char writeback_percent;
@@ -565,8 +566,7 @@ struct cache {
565 566
566 unsigned watermark[WATERMARK_MAX]; 567 unsigned watermark[WATERMARK_MAX];
567 568
568 struct closure alloc; 569 struct task_struct *alloc_thread;
569 struct workqueue_struct *alloc_workqueue;
570 570
571 struct closure prio; 571 struct closure prio;
572 struct prio_set *disk_buckets; 572 struct prio_set *disk_buckets;
@@ -664,13 +664,9 @@ struct gc_stat {
664 * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; 664 * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
665 * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. 665 * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
666 * flushing dirty data). 666 * flushing dirty data).
667 *
668 * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
669 * the allocation thread.
670 */ 667 */
671#define CACHE_SET_UNREGISTERING 0 668#define CACHE_SET_UNREGISTERING 0
672#define CACHE_SET_STOPPING 1 669#define CACHE_SET_STOPPING 1
673#define CACHE_SET_STOPPING_2 2
674 670
675struct cache_set { 671struct cache_set {
676 struct closure cl; 672 struct closure cl;
@@ -703,9 +699,6 @@ struct cache_set {
703 /* For the btree cache */ 699 /* For the btree cache */
704 struct shrinker shrink; 700 struct shrinker shrink;
705 701
706 /* For the allocator itself */
707 wait_queue_head_t alloc_wait;
708
709 /* For the btree cache and anything allocation related */ 702 /* For the btree cache and anything allocation related */
710 struct mutex bucket_lock; 703 struct mutex bucket_lock;
711 704
@@ -823,10 +816,9 @@ struct cache_set {
823 816
824 /* 817 /*
825 * A btree node on disk could have too many bsets for an iterator to fit 818 * A btree node on disk could have too many bsets for an iterator to fit
826 * on the stack - this is a single element mempool for btree_read_work() 819 * on the stack - have to dynamically allocate them
827 */ 820 */
828 struct mutex fill_lock; 821 mempool_t *fill_iter;
829 struct btree_iter *fill_iter;
830 822
831 /* 823 /*
832 * btree_sort() is a merge sort and requires temporary space - single 824 * btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +826,7 @@ struct cache_set {
834 */ 826 */
835 struct mutex sort_lock; 827 struct mutex sort_lock;
836 struct bset *sort; 828 struct bset *sort;
829 unsigned sort_crit_factor;
837 830
838 /* List of buckets we're currently writing data to */ 831 /* List of buckets we're currently writing data to */
839 struct list_head data_buckets; 832 struct list_head data_buckets;
@@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void)
906 return local_clock() >> 10; 899 return local_clock() >> 10;
907} 900}
908 901
909#define MAX_BSETS 4U
910
911#define BTREE_PRIO USHRT_MAX 902#define BTREE_PRIO USHRT_MAX
912#define INITIAL_PRIO 32768 903#define INITIAL_PRIO 32768
913 904
@@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1112 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 1103 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1113} 1104}
1114 1105
1115/* Blktrace macros */
1116
1117#define blktrace_msg(c, fmt, ...) \
1118do { \
1119 struct request_queue *q = bdev_get_queue(c->bdev); \
1120 if (q) \
1121 blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
1122} while (0)
1123
1124#define blktrace_msg_all(s, fmt, ...) \
1125do { \
1126 struct cache *_c; \
1127 unsigned i; \
1128 for_each_cache(_c, (s), i) \
1129 blktrace_msg(_c, fmt, ##__VA_ARGS__); \
1130} while (0)
1131
1132static inline void cached_dev_put(struct cached_dev *dc) 1106static inline void cached_dev_put(struct cached_dev *dc)
1133{ 1107{
1134 if (atomic_dec_and_test(&dc->count)) 1108 if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
1173 static struct kobj_attribute ksysfs_##n = \ 1147 static struct kobj_attribute ksysfs_##n = \
1174 __ATTR(n, S_IWUSR|S_IRUSR, show, store) 1148 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1175 1149
1176/* Forward declarations */ 1150static inline void wake_up_allocators(struct cache_set *c)
1151{
1152 struct cache *ca;
1153 unsigned i;
1154
1155 for_each_cache(ca, c, i)
1156 wake_up_process(ca->alloc_thread);
1157}
1177 1158
1178void bch_writeback_queue(struct cached_dev *); 1159/* Forward declarations */
1179void bch_writeback_add(struct cached_dev *, unsigned);
1180 1160
1181void bch_count_io_errors(struct cache *, int, const char *); 1161void bch_count_io_errors(struct cache *, int, const char *);
1182void bch_bbio_count_io_errors(struct cache_set *, struct bio *, 1162void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
1193uint8_t bch_inc_gen(struct cache *, struct bucket *); 1173uint8_t bch_inc_gen(struct cache *, struct bucket *);
1194void bch_rescale_priorities(struct cache_set *, int); 1174void bch_rescale_priorities(struct cache_set *, int);
1195bool bch_bucket_add_unused(struct cache *, struct bucket *); 1175bool bch_bucket_add_unused(struct cache *, struct bucket *);
1196void bch_allocator_thread(struct closure *);
1197 1176
1198long bch_bucket_alloc(struct cache *, unsigned, struct closure *); 1177long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
1199void bch_bucket_free(struct cache_set *, struct bkey *); 1178void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *);
1241struct cache_set *bch_cache_set_alloc(struct cache_sb *); 1220struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1242void bch_btree_cache_free(struct cache_set *); 1221void bch_btree_cache_free(struct cache_set *);
1243int bch_btree_cache_alloc(struct cache_set *); 1222int bch_btree_cache_alloc(struct cache_set *);
1244void bch_cached_dev_writeback_init(struct cached_dev *);
1245void bch_moving_init_cache_set(struct cache_set *); 1223void bch_moving_init_cache_set(struct cache_set *);
1246 1224
1225int bch_cache_allocator_start(struct cache *ca);
1247void bch_cache_allocator_exit(struct cache *ca); 1226void bch_cache_allocator_exit(struct cache *ca);
1248int bch_cache_allocator_init(struct cache *ca); 1227int bch_cache_allocator_init(struct cache *ca);
1249 1228
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 1d27d3af3251..8010eed06a51 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) 78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
79{ 79{
80 unsigned i; 80 unsigned i;
81 char buf[80];
81 82
82 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) 83 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
83 goto bad; 84 goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
102 103
103 return false; 104 return false;
104bad: 105bad:
105 cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); 106 bch_bkey_to_text(buf, sizeof(buf), k);
107 cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
106 return true; 108 return true;
107} 109}
108 110
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
162#ifdef CONFIG_BCACHE_EDEBUG 164#ifdef CONFIG_BCACHE_EDEBUG
163bug: 165bug:
164 mutex_unlock(&b->c->bucket_lock); 166 mutex_unlock(&b->c->bucket_lock);
165 btree_bug(b, 167
168 {
169 char buf[80];
170
171 bch_bkey_to_text(buf, sizeof(buf), k);
172 btree_bug(b,
166"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 173"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
167 pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), 174 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
168 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 175 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
176 }
169 return true; 177 return true;
170#endif 178#endif
171} 179}
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
1084 new->sets->size = 0; 1092 new->sets->size = 0;
1085} 1093}
1086 1094
1095#define SORT_CRIT (4096 / sizeof(uint64_t))
1096
1087void bch_btree_sort_lazy(struct btree *b) 1097void bch_btree_sort_lazy(struct btree *b)
1088{ 1098{
1089 if (b->nsets) { 1099 unsigned crit = SORT_CRIT;
1090 unsigned i, j, keys = 0, total; 1100 int i;
1091 1101
1092 for (i = 0; i <= b->nsets; i++) 1102 /* Don't sort if nothing to do */
1093 keys += b->sets[i].data->keys; 1103 if (!b->nsets)
1094 1104 goto out;
1095 total = keys;
1096 1105
1097 for (j = 0; j < b->nsets; j++) { 1106 /* If not a leaf node, always sort */
1098 if (keys * 2 < total || 1107 if (b->level) {
1099 keys < 1000) { 1108 bch_btree_sort(b);
1100 bch_btree_sort_partial(b, j); 1109 return;
1101 return; 1110 }
1102 }
1103 1111
1104 keys -= b->sets[j].data->keys; 1112 for (i = b->nsets - 1; i >= 0; --i) {
1105 } 1113 crit *= b->c->sort_crit_factor;
1106 1114
1107 /* Must sort if b->nsets == 3 or we'll overflow */ 1115 if (b->sets[i].data->keys < crit) {
1108 if (b->nsets >= (MAX_BSETS - 1) - b->level) { 1116 bch_btree_sort_partial(b, i);
1109 bch_btree_sort(b);
1110 return; 1117 return;
1111 } 1118 }
1112 } 1119 }
1113 1120
1121 /* Sort if we'd overflow */
1122 if (b->nsets + 1 == MAX_BSETS) {
1123 bch_btree_sort(b);
1124 return;
1125 }
1126
1127out:
1114 bset_build_written_tree(b); 1128 bset_build_written_tree(b);
1115} 1129}
1116 1130
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 57a9cff41546..ae115a253d73 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,6 +1,8 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h>
5
4/* 6/*
5 * BKEYS: 7 * BKEYS:
6 * 8 *
@@ -142,6 +144,8 @@
142 144
143/* Btree key comparison/iteration */ 145/* Btree key comparison/iteration */
144 146
147#define MAX_BSETS 4U
148
145struct btree_iter { 149struct btree_iter {
146 size_t size, used; 150 size_t size, used;
147 struct btree_iter_set { 151 struct btree_iter_set {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..ee372884c405 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "request.h" 26#include "request.h"
27#include "writeback.h"
27 28
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/bitops.h> 30#include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
134 return crc ^ 0xffffffffffffffffULL; 135 return crc ^ 0xffffffffffffffffULL;
135} 136}
136 137
137static void btree_bio_endio(struct bio *bio, int error) 138static void bch_btree_node_read_done(struct btree *b)
138{ 139{
139 struct closure *cl = bio->bi_private;
140 struct btree *b = container_of(cl, struct btree, io.cl);
141
142 if (error)
143 set_btree_node_io_error(b);
144
145 bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
146 ? "writing btree" : "reading btree");
147 closure_put(cl);
148}
149
150static void btree_bio_init(struct btree *b)
151{
152 BUG_ON(b->bio);
153 b->bio = bch_bbio_alloc(b->c);
154
155 b->bio->bi_end_io = btree_bio_endio;
156 b->bio->bi_private = &b->io.cl;
157}
158
159void bch_btree_read_done(struct closure *cl)
160{
161 struct btree *b = container_of(cl, struct btree, io.cl);
162 struct bset *i = b->sets[0].data;
163 struct btree_iter *iter = b->c->fill_iter;
164 const char *err = "bad btree header"; 140 const char *err = "bad btree header";
165 BUG_ON(b->nsets || b->written); 141 struct bset *i = b->sets[0].data;
166 142 struct btree_iter *iter;
167 bch_bbio_free(b->bio, b->c);
168 b->bio = NULL;
169 143
170 mutex_lock(&b->c->fill_lock); 144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
171 iter->used = 0; 146 iter->used = 0;
172 147
173 if (btree_node_io_error(b) || 148 if (!i->seq)
174 !i->seq)
175 goto err; 149 goto err;
176 150
177 for (; 151 for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
228 if (b->written < btree_blocks(b)) 202 if (b->written < btree_blocks(b))
229 bch_bset_init_next(b); 203 bch_bset_init_next(b);
230out: 204out:
231 205 mempool_free(iter, b->c->fill_iter);
232 mutex_unlock(&b->c->fill_lock); 206 return;
233
234 spin_lock(&b->c->btree_read_time_lock);
235 bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
236 spin_unlock(&b->c->btree_read_time_lock);
237
238 smp_wmb(); /* read_done is our write lock */
239 set_btree_node_read_done(b);
240
241 closure_return(cl);
242err: 207err:
243 set_btree_node_io_error(b); 208 set_btree_node_io_error(b);
244 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
247 goto out; 212 goto out;
248} 213}
249 214
250void bch_btree_read(struct btree *b) 215static void btree_node_read_endio(struct bio *bio, int error)
216{
217 struct closure *cl = bio->bi_private;
218 closure_put(cl);
219}
220
221void bch_btree_node_read(struct btree *b)
251{ 222{
252 BUG_ON(b->nsets || b->written); 223 uint64_t start_time = local_clock();
224 struct closure cl;
225 struct bio *bio;
226
227 trace_bcache_btree_read(b);
228
229 closure_init_stack(&cl);
230
231 bio = bch_bbio_alloc(b->c);
232 bio->bi_rw = REQ_META|READ_SYNC;
233 bio->bi_size = KEY_SIZE(&b->key) << 9;
234 bio->bi_end_io = btree_node_read_endio;
235 bio->bi_private = &cl;
236
237 bch_bio_map(bio, b->sets[0].data);
238
239 bch_submit_bbio(bio, b->c, &b->key, 0);
240 closure_sync(&cl);
253 241
254 if (!closure_trylock(&b->io.cl, &b->c->cl)) 242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
255 BUG(); 243 set_btree_node_io_error(b);
256 244
257 b->io_start_time = local_clock(); 245 bch_bbio_free(bio, b->c);
258 246
259 btree_bio_init(b); 247 if (btree_node_io_error(b))
260 b->bio->bi_rw = REQ_META|READ_SYNC; 248 goto err;
261 b->bio->bi_size = KEY_SIZE(&b->key) << 9;
262 249
263 bch_bio_map(b->bio, b->sets[0].data); 250 bch_btree_node_read_done(b);
264 251
265 pr_debug("%s", pbtree(b)); 252 spin_lock(&b->c->btree_read_time_lock);
266 trace_bcache_btree_read(b->bio); 253 bch_time_stats_update(&b->c->btree_read_time, start_time);
267 bch_submit_bbio(b->bio, b->c, &b->key, 0); 254 spin_unlock(&b->c->btree_read_time_lock);
268 255
269 continue_at(&b->io.cl, bch_btree_read_done, system_wq); 256 return;
257err:
258 bch_cache_set_error(b->c, "io error reading bucket %lu",
259 PTR_BUCKET_NR(b->c, &b->key, 0));
270} 260}
271 261
272static void btree_complete_write(struct btree *b, struct btree_write *w) 262static void btree_complete_write(struct btree *b, struct btree_write *w)
273{ 263{
274 if (w->prio_blocked && 264 if (w->prio_blocked &&
275 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
276 wake_up(&b->c->alloc_wait); 266 wake_up_allocators(b->c);
277 267
278 if (w->journal) { 268 if (w->journal) {
279 atomic_dec_bug(w->journal); 269 atomic_dec_bug(w->journal);
280 __closure_wake_up(&b->c->journal.wait); 270 __closure_wake_up(&b->c->journal.wait);
281 } 271 }
282 272
283 if (w->owner)
284 closure_put(w->owner);
285
286 w->prio_blocked = 0; 273 w->prio_blocked = 0;
287 w->journal = NULL; 274 w->journal = NULL;
288 w->owner = NULL;
289} 275}
290 276
291static void __btree_write_done(struct closure *cl) 277static void __btree_node_write_done(struct closure *cl)
292{ 278{
293 struct btree *b = container_of(cl, struct btree, io.cl); 279 struct btree *b = container_of(cl, struct btree, io.cl);
294 struct btree_write *w = btree_prev_write(b); 280 struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
304 closure_return(cl); 290 closure_return(cl);
305} 291}
306 292
307static void btree_write_done(struct closure *cl) 293static void btree_node_write_done(struct closure *cl)
308{ 294{
309 struct btree *b = container_of(cl, struct btree, io.cl); 295 struct btree *b = container_of(cl, struct btree, io.cl);
310 struct bio_vec *bv; 296 struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
313 __bio_for_each_segment(bv, b->bio, n, 0) 299 __bio_for_each_segment(bv, b->bio, n, 0)
314 __free_page(bv->bv_page); 300 __free_page(bv->bv_page);
315 301
316 __btree_write_done(cl); 302 __btree_node_write_done(cl);
317} 303}
318 304
319static void do_btree_write(struct btree *b) 305static void btree_node_write_endio(struct bio *bio, int error)
306{
307 struct closure *cl = bio->bi_private;
308 struct btree *b = container_of(cl, struct btree, io.cl);
309
310 if (error)
311 set_btree_node_io_error(b);
312
313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
314 closure_put(cl);
315}
316
317static void do_btree_node_write(struct btree *b)
320{ 318{
321 struct closure *cl = &b->io.cl; 319 struct closure *cl = &b->io.cl;
322 struct bset *i = b->sets[b->nsets].data; 320 struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
325 i->version = BCACHE_BSET_VERSION; 323 i->version = BCACHE_BSET_VERSION;
326 i->csum = btree_csum_set(b, i); 324 i->csum = btree_csum_set(b, i);
327 325
328 btree_bio_init(b); 326 BUG_ON(b->bio);
329 b->bio->bi_rw = REQ_META|WRITE_SYNC; 327 b->bio = bch_bbio_alloc(b->c);
330 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 328
329 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
331 bch_bio_map(b->bio, i); 333 bch_bio_map(b->bio, i);
332 334
335 /*
336 * If we're appending to a leaf node, we don't technically need FUA -
337 * this write just needs to be persisted before the next journal write,
338 * which will be marked FLUSH|FUA.
339 *
340 * Similarly if we're writing a new btree root - the pointer is going to
341 * be in the next journal entry.
342 *
343 * But if we're writing a new btree node (that isn't a root) or
344 * appending to a non leaf btree node, we need either FUA or a flush
345 * when we write the parent with the new pointer. FUA is cheaper than a
346 * flush, and writes appending to leaf nodes aren't blocking anything so
347 * just make all btree node writes FUA to keep things sane.
348 */
349
333 bkey_copy(&k.key, &b->key); 350 bkey_copy(&k.key, &b->key);
334 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
335 352
336 if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { 353 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
337 int j; 354 int j;
338 struct bio_vec *bv; 355 struct bio_vec *bv;
339 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
342 memcpy(page_address(bv->bv_page), 359 memcpy(page_address(bv->bv_page),
343 base + j * PAGE_SIZE, PAGE_SIZE); 360 base + j * PAGE_SIZE, PAGE_SIZE);
344 361
345 trace_bcache_btree_write(b->bio);
346 bch_submit_bbio(b->bio, b->c, &k.key, 0); 362 bch_submit_bbio(b->bio, b->c, &k.key, 0);
347 363
348 continue_at(cl, btree_write_done, NULL); 364 continue_at(cl, btree_node_write_done, NULL);
349 } else { 365 } else {
350 b->bio->bi_vcnt = 0; 366 b->bio->bi_vcnt = 0;
351 bch_bio_map(b->bio, i); 367 bch_bio_map(b->bio, i);
352 368
353 trace_bcache_btree_write(b->bio);
354 bch_submit_bbio(b->bio, b->c, &k.key, 0); 369 bch_submit_bbio(b->bio, b->c, &k.key, 0);
355 370
356 closure_sync(cl); 371 closure_sync(cl);
357 __btree_write_done(cl); 372 __btree_node_write_done(cl);
358 } 373 }
359} 374}
360 375
361static void __btree_write(struct btree *b) 376void bch_btree_node_write(struct btree *b, struct closure *parent)
362{ 377{
363 struct bset *i = b->sets[b->nsets].data; 378 struct bset *i = b->sets[b->nsets].data;
364 379
380 trace_bcache_btree_write(b);
381
365 BUG_ON(current->bio_list); 382 BUG_ON(current->bio_list);
383 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i);
366 387
367 closure_lock(&b->io, &b->c->cl);
368 cancel_delayed_work(&b->work); 388 cancel_delayed_work(&b->work);
369 389
390 /* If caller isn't waiting for write, parent refcount is cache set */
391 closure_lock(&b->io, parent ?: &b->c->cl);
392
370 clear_bit(BTREE_NODE_dirty, &b->flags); 393 clear_bit(BTREE_NODE_dirty, &b->flags);
371 change_bit(BTREE_NODE_write_idx, &b->flags); 394 change_bit(BTREE_NODE_write_idx, &b->flags);
372 395
373 bch_check_key_order(b, i); 396 do_btree_node_write(b);
374 BUG_ON(b->written && !i->keys);
375
376 do_btree_write(b);
377
378 pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
379 397
380 b->written += set_blocks(i, b->c); 398 b->written += set_blocks(i, b->c);
381 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
387 bch_bset_init_next(b); 405 bch_bset_init_next(b);
388} 406}
389 407
390static void btree_write_work(struct work_struct *w) 408static void btree_node_write_work(struct work_struct *w)
391{ 409{
392 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 410 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
393 411
394 down_write(&b->lock); 412 rw_lock(true, b, b->level);
395 413
396 if (btree_node_dirty(b)) 414 if (btree_node_dirty(b))
397 __btree_write(b); 415 bch_btree_node_write(b, NULL);
398 up_write(&b->lock); 416 rw_unlock(true, b);
399} 417}
400 418
401void bch_btree_write(struct btree *b, bool now, struct btree_op *op) 419static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
402{ 420{
403 struct bset *i = b->sets[b->nsets].data; 421 struct bset *i = b->sets[b->nsets].data;
404 struct btree_write *w = btree_current_write(b); 422 struct btree_write *w = btree_current_write(b);
405 423
406 BUG_ON(b->written && 424 BUG_ON(!b->written);
407 (b->written >= btree_blocks(b) || 425 BUG_ON(!i->keys);
408 i->seq != b->sets[0].data->seq ||
409 !i->keys));
410 426
411 if (!btree_node_dirty(b)) { 427 if (!btree_node_dirty(b))
412 set_btree_node_dirty(b); 428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
413 queue_delayed_work(btree_io_wq, &b->work,
414 msecs_to_jiffies(30000));
415 }
416 429
417 w->prio_blocked += b->prio_blocked; 430 set_btree_node_dirty(b);
418 b->prio_blocked = 0;
419 431
420 if (op && op->journal && !b->level) { 432 if (op && op->journal) {
421 if (w->journal && 433 if (w->journal &&
422 journal_pin_cmp(b->c, w, op)) { 434 journal_pin_cmp(b->c, w, op)) {
423 atomic_dec_bug(w->journal); 435 atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
430 } 442 }
431 } 443 }
432 444
433 if (current->bio_list)
434 return;
435
436 /* Force write if set is too big */ 445 /* Force write if set is too big */
437 if (now || 446 if (set_bytes(i) > PAGE_SIZE - 48 &&
438 b->level || 447 !current->bio_list)
439 set_bytes(i) > PAGE_SIZE - 48) { 448 bch_btree_node_write(b, NULL);
440 if (op && now) {
441 /* Must wait on multiple writes */
442 BUG_ON(w->owner);
443 w->owner = &op->cl;
444 closure_get(&op->cl);
445 }
446
447 __btree_write(b);
448 }
449 BUG_ON(!b->written);
450} 449}
451 450
452/* 451/*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
559 init_rwsem(&b->lock); 558 init_rwsem(&b->lock);
560 lockdep_set_novalidate_class(&b->lock); 559 lockdep_set_novalidate_class(&b->lock);
561 INIT_LIST_HEAD(&b->list); 560 INIT_LIST_HEAD(&b->list);
562 INIT_DELAYED_WORK(&b->work, btree_write_work); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
563 b->c = c; 562 b->c = c;
564 closure_init_unlocked(&b->io); 563 closure_init_unlocked(&b->io);
565 564
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
582 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
583 582
584 if (cl && btree_node_dirty(b)) 583 if (cl && btree_node_dirty(b))
585 bch_btree_write(b, true, NULL); 584 bch_btree_node_write(b, NULL);
586 585
587 if (cl) 586 if (cl)
588 closure_wait_event_async(&b->io.wait, cl, 587 closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
623 else if (!mutex_trylock(&c->bucket_lock)) 622 else if (!mutex_trylock(&c->bucket_lock))
624 return -1; 623 return -1;
625 624
625 /*
626 * It's _really_ critical that we don't free too many btree nodes - we
627 * have to always leave ourselves a reserve. The reserve is how we
628 * guarantee that allocating memory for a new btree node can always
629 * succeed, so that inserting keys into the btree can always succeed and
630 * IO can always make forward progress:
631 */
626 nr /= c->btree_pages; 632 nr /= c->btree_pages;
627 nr = min_t(unsigned long, nr, mca_can_free(c)); 633 nr = min_t(unsigned long, nr, mca_can_free(c));
628 634
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
766 int ret = -ENOMEM; 772 int ret = -ENOMEM;
767 struct btree *i; 773 struct btree *i;
768 774
775 trace_bcache_btree_cache_cannibalize(c);
776
769 if (!cl) 777 if (!cl)
770 return ERR_PTR(-ENOMEM); 778 return ERR_PTR(-ENOMEM);
771 779
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
784 return ERR_PTR(-EAGAIN); 792 return ERR_PTR(-EAGAIN);
785 } 793 }
786 794
787 /* XXX: tracepoint */
788 c->try_harder = cl; 795 c->try_harder = cl;
789 c->try_harder_start = local_clock(); 796 c->try_harder_start = local_clock();
790retry: 797retry:
@@ -905,6 +912,9 @@ retry:
905 b = mca_find(c, k); 912 b = mca_find(c, k);
906 913
907 if (!b) { 914 if (!b) {
915 if (current->bio_list)
916 return ERR_PTR(-EAGAIN);
917
908 mutex_lock(&c->bucket_lock); 918 mutex_lock(&c->bucket_lock);
909 b = mca_alloc(c, k, level, &op->cl); 919 b = mca_alloc(c, k, level, &op->cl);
910 mutex_unlock(&c->bucket_lock); 920 mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
914 if (IS_ERR(b)) 924 if (IS_ERR(b))
915 return b; 925 return b;
916 926
917 bch_btree_read(b); 927 bch_btree_node_read(b);
918 928
919 if (!write) 929 if (!write)
920 downgrade_write(&b->lock); 930 downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
937 for (; i <= b->nsets; i++) 947 for (; i <= b->nsets; i++)
938 prefetch(b->sets[i].data); 948 prefetch(b->sets[i].data);
939 949
940 if (!closure_wait_event(&b->io.wait, &op->cl, 950 if (btree_node_io_error(b)) {
941 btree_node_read_done(b))) {
942 rw_unlock(write, b);
943 b = ERR_PTR(-EAGAIN);
944 } else if (btree_node_io_error(b)) {
945 rw_unlock(write, b); 951 rw_unlock(write, b);
946 b = ERR_PTR(-EIO); 952 return ERR_PTR(-EIO);
947 } else 953 }
948 BUG_ON(!b->written); 954
955 BUG_ON(!b->written);
949 956
950 return b; 957 return b;
951} 958}
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
959 mutex_unlock(&c->bucket_lock); 966 mutex_unlock(&c->bucket_lock);
960 967
961 if (!IS_ERR_OR_NULL(b)) { 968 if (!IS_ERR_OR_NULL(b)) {
962 bch_btree_read(b); 969 bch_btree_node_read(b);
963 rw_unlock(true, b); 970 rw_unlock(true, b);
964 } 971 }
965} 972}
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
970{ 977{
971 unsigned i; 978 unsigned i;
972 979
980 trace_bcache_btree_node_free(b);
981
973 /* 982 /*
974 * The BUG_ON() in btree_node_get() implies that we must have a write 983 * The BUG_ON() in btree_node_get() implies that we must have a write
975 * lock on parent to free or even invalidate a node 984 * lock on parent to free or even invalidate a node
976 */ 985 */
977 BUG_ON(op->lock <= b->level); 986 BUG_ON(op->lock <= b->level);
978 BUG_ON(b == b->c->root); 987 BUG_ON(b == b->c->root);
979 pr_debug("bucket %s", pbtree(b));
980 988
981 if (btree_node_dirty(b)) 989 if (btree_node_dirty(b))
982 btree_complete_write(b, btree_current_write(b)); 990 btree_complete_write(b, btree_current_write(b));
983 clear_bit(BTREE_NODE_dirty, &b->flags); 991 clear_bit(BTREE_NODE_dirty, &b->flags);
984 992
985 if (b->prio_blocked &&
986 !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
987 wake_up(&b->c->alloc_wait);
988
989 b->prio_blocked = 0;
990
991 cancel_delayed_work(&b->work); 993 cancel_delayed_work(&b->work);
992 994
993 mutex_lock(&b->c->bucket_lock); 995 mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
1028 goto retry; 1030 goto retry;
1029 } 1031 }
1030 1032
1031 set_btree_node_read_done(b);
1032 b->accessed = 1; 1033 b->accessed = 1;
1033 bch_bset_init_next(b); 1034 bch_bset_init_next(b);
1034 1035
1035 mutex_unlock(&c->bucket_lock); 1036 mutex_unlock(&c->bucket_lock);
1037
1038 trace_bcache_btree_node_alloc(b);
1036 return b; 1039 return b;
1037err_free: 1040err_free:
1038 bch_bucket_free(c, &k.key); 1041 bch_bucket_free(c, &k.key);
1039 __bkey_put(c, &k.key); 1042 __bkey_put(c, &k.key);
1040err: 1043err:
1041 mutex_unlock(&c->bucket_lock); 1044 mutex_unlock(&c->bucket_lock);
1045
1046 trace_bcache_btree_node_alloc_fail(b);
1042 return b; 1047 return b;
1043} 1048}
1044 1049
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1137 gc->nkeys++; 1142 gc->nkeys++;
1138 1143
1139 gc->data += KEY_SIZE(k); 1144 gc->data += KEY_SIZE(k);
1140 if (KEY_DIRTY(k)) { 1145 if (KEY_DIRTY(k))
1141 gc->dirty += KEY_SIZE(k); 1146 gc->dirty += KEY_SIZE(k);
1142 if (d)
1143 d->sectors_dirty_gc += KEY_SIZE(k);
1144 }
1145 } 1147 }
1146 1148
1147 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1166 1168
1167 if (!IS_ERR_OR_NULL(n)) { 1169 if (!IS_ERR_OR_NULL(n)) {
1168 swap(b, n); 1170 swap(b, n);
1171 __bkey_put(b->c, &b->key);
1169 1172
1170 memcpy(k->ptr, b->key.ptr, 1173 memcpy(k->ptr, b->key.ptr,
1171 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1174 sizeof(uint64_t) * KEY_PTRS(&b->key));
1172 1175
1173 __bkey_put(b->c, &b->key);
1174 atomic_inc(&b->c->prio_blocked);
1175 b->prio_blocked++;
1176
1177 btree_node_free(n, op); 1176 btree_node_free(n, op);
1178 up_write(&n->lock); 1177 up_write(&n->lock);
1179 } 1178 }
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1278 btree_node_free(r->b, op); 1277 btree_node_free(r->b, op);
1279 up_write(&r->b->lock); 1278 up_write(&r->b->lock);
1280 1279
1281 pr_debug("coalesced %u nodes", nodes); 1280 trace_bcache_btree_gc_coalesce(nodes);
1282 1281
1283 gc->nodes--; 1282 gc->nodes--;
1284 nodes--; 1283 nodes--;
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1293 void write(struct btree *r) 1292 void write(struct btree *r)
1294 { 1293 {
1295 if (!r->written) 1294 if (!r->written)
1296 bch_btree_write(r, true, op); 1295 bch_btree_node_write(r, &op->cl);
1297 else if (btree_node_dirty(r)) { 1296 else if (btree_node_dirty(r))
1298 BUG_ON(btree_current_write(r)->owner); 1297 bch_btree_node_write(r, writes);
1299 btree_current_write(r)->owner = writes;
1300 closure_get(writes);
1301
1302 bch_btree_write(r, true, NULL);
1303 }
1304 1298
1305 up_write(&r->lock); 1299 up_write(&r->lock);
1306 } 1300 }
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1386 ret = btree_gc_recurse(b, op, writes, gc); 1380 ret = btree_gc_recurse(b, op, writes, gc);
1387 1381
1388 if (!b->written || btree_node_dirty(b)) { 1382 if (!b->written || btree_node_dirty(b)) {
1389 atomic_inc(&b->c->prio_blocked); 1383 bch_btree_node_write(b, n ? &op->cl : NULL);
1390 b->prio_blocked++;
1391 bch_btree_write(b, true, n ? op : NULL);
1392 } 1384 }
1393 1385
1394 if (!IS_ERR_OR_NULL(n)) { 1386 if (!IS_ERR_OR_NULL(n)) {
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
1405{ 1397{
1406 struct cache *ca; 1398 struct cache *ca;
1407 struct bucket *b; 1399 struct bucket *b;
1408 struct bcache_device **d;
1409 unsigned i; 1400 unsigned i;
1410 1401
1411 if (!c->gc_mark_valid) 1402 if (!c->gc_mark_valid)
@@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c)
1419 for_each_cache(ca, c, i) 1410 for_each_cache(ca, c, i)
1420 for_each_bucket(b, ca) { 1411 for_each_bucket(b, ca) {
1421 b->gc_gen = b->gen; 1412 b->gc_gen = b->gen;
1422 if (!atomic_read(&b->pin)) 1413 if (!atomic_read(&b->pin)) {
1423 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1415 SET_GC_SECTORS_USED(b, 0);
1416 }
1424 } 1417 }
1425 1418
1426 for (d = c->devices;
1427 d < c->devices + c->nr_uuids;
1428 d++)
1429 if (*d)
1430 (*d)->sectors_dirty_gc = 0;
1431
1432 mutex_unlock(&c->bucket_lock); 1419 mutex_unlock(&c->bucket_lock);
1433} 1420}
1434 1421
@@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1437 size_t available = 0; 1424 size_t available = 0;
1438 struct bucket *b; 1425 struct bucket *b;
1439 struct cache *ca; 1426 struct cache *ca;
1440 struct bcache_device **d;
1441 unsigned i; 1427 unsigned i;
1442 1428
1443 mutex_lock(&c->bucket_lock); 1429 mutex_lock(&c->bucket_lock);
@@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1480 } 1466 }
1481 } 1467 }
1482 1468
1483 for (d = c->devices;
1484 d < c->devices + c->nr_uuids;
1485 d++)
1486 if (*d) {
1487 unsigned long last =
1488 atomic_long_read(&((*d)->sectors_dirty));
1489 long difference = (*d)->sectors_dirty_gc - last;
1490
1491 pr_debug("sectors dirty off by %li", difference);
1492
1493 (*d)->sectors_dirty_last += difference;
1494
1495 atomic_long_set(&((*d)->sectors_dirty),
1496 (*d)->sectors_dirty_gc);
1497 }
1498
1499 mutex_unlock(&c->bucket_lock); 1469 mutex_unlock(&c->bucket_lock);
1500 return available; 1470 return available;
1501} 1471}
@@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl)
1508 struct gc_stat stats; 1478 struct gc_stat stats;
1509 struct closure writes; 1479 struct closure writes;
1510 struct btree_op op; 1480 struct btree_op op;
1511
1512 uint64_t start_time = local_clock(); 1481 uint64_t start_time = local_clock();
1513 trace_bcache_gc_start(c->sb.set_uuid); 1482
1514 blktrace_msg_all(c, "Starting gc"); 1483 trace_bcache_gc_start(c);
1515 1484
1516 memset(&stats, 0, sizeof(struct gc_stat)); 1485 memset(&stats, 0, sizeof(struct gc_stat));
1517 closure_init_stack(&writes); 1486 closure_init_stack(&writes);
@@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl)
1520 1489
1521 btree_gc_start(c); 1490 btree_gc_start(c);
1522 1491
1492 atomic_inc(&c->prio_blocked);
1493
1523 ret = btree_root(gc_root, c, &op, &writes, &stats); 1494 ret = btree_root(gc_root, c, &op, &writes, &stats);
1524 closure_sync(&op.cl); 1495 closure_sync(&op.cl);
1525 closure_sync(&writes); 1496 closure_sync(&writes);
1526 1497
1527 if (ret) { 1498 if (ret) {
1528 blktrace_msg_all(c, "Stopped gc");
1529 pr_warn("gc failed!"); 1499 pr_warn("gc failed!");
1530
1531 continue_at(cl, bch_btree_gc, bch_gc_wq); 1500 continue_at(cl, bch_btree_gc, bch_gc_wq);
1532 } 1501 }
1533 1502
@@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl)
1537 1506
1538 available = bch_btree_gc_finish(c); 1507 available = bch_btree_gc_finish(c);
1539 1508
1509 atomic_dec(&c->prio_blocked);
1510 wake_up_allocators(c);
1511
1540 bch_time_stats_update(&c->btree_gc_time, start_time); 1512 bch_time_stats_update(&c->btree_gc_time, start_time);
1541 1513
1542 stats.key_bytes *= sizeof(uint64_t); 1514 stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl)
1544 stats.data <<= 9; 1516 stats.data <<= 9;
1545 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1517 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1546 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1518 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1547 blktrace_msg_all(c, "Finished gc");
1548 1519
1549 trace_bcache_gc_end(c->sb.set_uuid); 1520 trace_bcache_gc_end(c);
1550 wake_up(&c->alloc_wait);
1551 1521
1552 continue_at(cl, bch_moving_gc, bch_gc_wq); 1522 continue_at(cl, bch_moving_gc, bch_gc_wq);
1553} 1523}
@@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b,
1654 struct btree_iter *iter, 1624 struct btree_iter *iter,
1655 struct btree_op *op) 1625 struct btree_op *op)
1656{ 1626{
1657 void subtract_dirty(struct bkey *k, int sectors) 1627 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1658 { 1628 {
1659 struct bcache_device *d = b->c->devices[KEY_INODE(k)]; 1629 if (KEY_DIRTY(k))
1660 1630 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1661 if (KEY_DIRTY(k) && d) 1631 offset, -sectors);
1662 atomic_long_sub(sectors, &d->sectors_dirty);
1663 } 1632 }
1664 1633
1634 uint64_t old_offset;
1665 unsigned old_size, sectors_found = 0; 1635 unsigned old_size, sectors_found = 0;
1666 1636
1667 while (1) { 1637 while (1) {
@@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b,
1673 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1643 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1674 continue; 1644 continue;
1675 1645
1646 old_offset = KEY_START(k);
1676 old_size = KEY_SIZE(k); 1647 old_size = KEY_SIZE(k);
1677 1648
1678 /* 1649 /*
@@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b,
1728 1699
1729 struct bkey *top; 1700 struct bkey *top;
1730 1701
1731 subtract_dirty(k, KEY_SIZE(insert)); 1702 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1732 1703
1733 if (bkey_written(b, k)) { 1704 if (bkey_written(b, k)) {
1734 /* 1705 /*
@@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b,
1775 } 1746 }
1776 } 1747 }
1777 1748
1778 subtract_dirty(k, old_size - KEY_SIZE(k)); 1749 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1779 } 1750 }
1780 1751
1781check_failed: 1752check_failed:
@@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1798{ 1769{
1799 struct bset *i = b->sets[b->nsets].data; 1770 struct bset *i = b->sets[b->nsets].data;
1800 struct bkey *m, *prev; 1771 struct bkey *m, *prev;
1801 const char *status = "insert"; 1772 unsigned status = BTREE_INSERT_STATUS_INSERT;
1802 1773
1803 BUG_ON(bkey_cmp(k, &b->key) > 0); 1774 BUG_ON(bkey_cmp(k, &b->key) > 0);
1804 BUG_ON(b->level && !KEY_PTRS(k)); 1775 BUG_ON(b->level && !KEY_PTRS(k));
@@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1831 goto insert; 1802 goto insert;
1832 1803
1833 /* prev is in the tree, if we merge we're done */ 1804 /* prev is in the tree, if we merge we're done */
1834 status = "back merging"; 1805 status = BTREE_INSERT_STATUS_BACK_MERGE;
1835 if (prev && 1806 if (prev &&
1836 bch_bkey_try_merge(b, prev, k)) 1807 bch_bkey_try_merge(b, prev, k))
1837 goto merged; 1808 goto merged;
1838 1809
1839 status = "overwrote front"; 1810 status = BTREE_INSERT_STATUS_OVERWROTE;
1840 if (m != end(i) && 1811 if (m != end(i) &&
1841 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1812 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1842 goto copy; 1813 goto copy;
1843 1814
1844 status = "front merge"; 1815 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1845 if (m != end(i) && 1816 if (m != end(i) &&
1846 bch_bkey_try_merge(b, k, m)) 1817 bch_bkey_try_merge(b, k, m))
1847 goto copy; 1818 goto copy;
@@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1851insert: shift_keys(b, m, k); 1822insert: shift_keys(b, m, k);
1852copy: bkey_copy(m, k); 1823copy: bkey_copy(m, k);
1853merged: 1824merged:
1854 bch_check_keys(b, "%s for %s at %s: %s", status, 1825 if (KEY_DIRTY(k))
1855 op_type(op), pbtree(b), pkey(k)); 1826 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1856 bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, 1827 KEY_START(k), KEY_SIZE(k));
1857 op_type(op), pbtree(b), pkey(k)); 1828
1829 bch_check_keys(b, "%u for %s", status, op_type(op));
1858 1830
1859 if (b->level && !KEY_OFFSET(k)) 1831 if (b->level && !KEY_OFFSET(k))
1860 b->prio_blocked++; 1832 btree_current_write(b)->prio_blocked++;
1861 1833
1862 pr_debug("%s for %s at %s: %s", status, 1834 trace_bcache_btree_insert_key(b, k, op->type, status);
1863 op_type(op), pbtree(b), pkey(k));
1864 1835
1865 return true; 1836 return true;
1866} 1837}
1867 1838
1868bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1839static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1869{ 1840{
1870 bool ret = false; 1841 bool ret = false;
1871 struct bkey *k; 1842 struct bkey *k;
@@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1896 should_split(b)) 1867 should_split(b))
1897 goto out; 1868 goto out;
1898 1869
1899 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); 1870 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
1900 1871
1901 SET_KEY_PTRS(&op->replace, 1); 1872 SET_KEY_PTRS(&op->replace, 1);
1902 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1873 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1907 1878
1908 BUG_ON(op->type != BTREE_INSERT); 1879 BUG_ON(op->type != BTREE_INSERT);
1909 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1880 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1910 bch_btree_write(b, false, NULL);
1911 ret = true; 1881 ret = true;
1912out: 1882out:
1913 downgrade_write(&b->lock); 1883 downgrade_write(&b->lock);
@@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
1929 1899
1930 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1900 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1931 1901
1932 pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
1933 pbtree(b), n1->sets[0].data->keys);
1934
1935 if (split) { 1902 if (split) {
1936 unsigned keys = 0; 1903 unsigned keys = 0;
1937 1904
1905 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1906
1938 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1907 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1939 if (IS_ERR(n2)) 1908 if (IS_ERR(n2))
1940 goto err_free1; 1909 goto err_free1;
@@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
1967 bkey_copy_key(&n2->key, &b->key); 1936 bkey_copy_key(&n2->key, &b->key);
1968 1937
1969 bch_keylist_add(&op->keys, &n2->key); 1938 bch_keylist_add(&op->keys, &n2->key);
1970 bch_btree_write(n2, true, op); 1939 bch_btree_node_write(n2, &op->cl);
1971 rw_unlock(true, n2); 1940 rw_unlock(true, n2);
1972 } else 1941 } else {
1942 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1943
1973 bch_btree_insert_keys(n1, op); 1944 bch_btree_insert_keys(n1, op);
1945 }
1974 1946
1975 bch_keylist_add(&op->keys, &n1->key); 1947 bch_keylist_add(&op->keys, &n1->key);
1976 bch_btree_write(n1, true, op); 1948 bch_btree_node_write(n1, &op->cl);
1977 1949
1978 if (n3) { 1950 if (n3) {
1979 bkey_copy_key(&n3->key, &MAX_KEY); 1951 bkey_copy_key(&n3->key, &MAX_KEY);
1980 bch_btree_insert_keys(n3, op); 1952 bch_btree_insert_keys(n3, op);
1981 bch_btree_write(n3, true, op); 1953 bch_btree_node_write(n3, &op->cl);
1982 1954
1983 closure_sync(&op->cl); 1955 closure_sync(&op->cl);
1984 bch_btree_set_root(n3); 1956 bch_btree_set_root(n3);
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2082 2054
2083 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2055 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2084 2056
2085 if (bch_btree_insert_keys(b, op)) 2057 if (bch_btree_insert_keys(b, op)) {
2086 bch_btree_write(b, false, op); 2058 if (!b->level)
2059 bch_btree_leaf_dirty(b, op);
2060 else
2061 bch_btree_node_write(b, &op->cl);
2062 }
2087 } 2063 }
2088 2064
2089 return 0; 2065 return 0;
@@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2140void bch_btree_set_root(struct btree *b) 2116void bch_btree_set_root(struct btree *b)
2141{ 2117{
2142 unsigned i; 2118 unsigned i;
2119 struct closure cl;
2120
2121 closure_init_stack(&cl);
2122
2123 trace_bcache_btree_set_root(b);
2143 2124
2144 BUG_ON(!b->written); 2125 BUG_ON(!b->written);
2145 2126
@@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b)
2153 b->c->root = b; 2134 b->c->root = b;
2154 __bkey_put(b->c, &b->key); 2135 __bkey_put(b->c, &b->key);
2155 2136
2156 bch_journal_meta(b->c, NULL); 2137 bch_journal_meta(b->c, &cl);
2157 pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); 2138 closure_sync(&cl);
2158} 2139}
2159 2140
2160/* Cache lookup */ 2141/* Cache lookup */
@@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2215 KEY_OFFSET(k) - bio->bi_sector); 2196 KEY_OFFSET(k) - bio->bi_sector);
2216 2197
2217 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2198 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2218 if (!n)
2219 return -EAGAIN;
2220
2221 if (n == bio) 2199 if (n == bio)
2222 op->lookup_done = true; 2200 op->lookup_done = true;
2223 2201
@@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2240 n->bi_end_io = bch_cache_read_endio; 2218 n->bi_end_io = bch_cache_read_endio;
2241 n->bi_private = &s->cl; 2219 n->bi_private = &s->cl;
2242 2220
2243 trace_bcache_cache_hit(n);
2244 __bch_submit_bbio(n, b->c); 2221 __bch_submit_bbio(n, b->c);
2245 } 2222 }
2246 2223
@@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2257 struct btree_iter iter; 2234 struct btree_iter iter;
2258 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2235 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2259 2236
2260 pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
2261 (uint64_t) bio->bi_sector);
2262
2263 do { 2237 do {
2264 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2238 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2265 if (!k) { 2239 if (!k) {
@@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2303} 2277}
2304 2278
2305static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2279static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2306 struct keybuf *buf, struct bkey *end) 2280 struct keybuf *buf, struct bkey *end,
2281 keybuf_pred_fn *pred)
2307{ 2282{
2308 struct btree_iter iter; 2283 struct btree_iter iter;
2309 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2284 bch_btree_iter_init(b, &iter, &buf->last_scanned);
@@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2322 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2297 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2323 break; 2298 break;
2324 2299
2325 if (buf->key_predicate(buf, k)) { 2300 if (pred(buf, k)) {
2326 struct keybuf_key *w; 2301 struct keybuf_key *w;
2327 2302
2328 pr_debug("%s", pkey(k));
2329
2330 spin_lock(&buf->lock); 2303 spin_lock(&buf->lock);
2331 2304
2332 w = array_alloc(&buf->freelist); 2305 w = array_alloc(&buf->freelist);
@@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2343 if (!k) 2316 if (!k)
2344 break; 2317 break;
2345 2318
2346 btree(refill_keybuf, k, b, op, buf, end); 2319 btree(refill_keybuf, k, b, op, buf, end, pred);
2347 /* 2320 /*
2348 * Might get an error here, but can't really do anything 2321 * Might get an error here, but can't really do anything
2349 * and it'll get logged elsewhere. Just read what we 2322 * and it'll get logged elsewhere. Just read what we
@@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2361} 2334}
2362 2335
2363void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2336void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2364 struct bkey *end) 2337 struct bkey *end, keybuf_pred_fn *pred)
2365{ 2338{
2366 struct bkey start = buf->last_scanned; 2339 struct bkey start = buf->last_scanned;
2367 struct btree_op op; 2340 struct btree_op op;
@@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2369 2342
2370 cond_resched(); 2343 cond_resched();
2371 2344
2372 btree_root(refill_keybuf, c, &op, buf, end); 2345 btree_root(refill_keybuf, c, &op, buf, end, pred);
2373 closure_sync(&op.cl); 2346 closure_sync(&op.cl);
2374 2347
2375 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2348 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2455 2428
2456struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2429struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2457 struct keybuf *buf, 2430 struct keybuf *buf,
2458 struct bkey *end) 2431 struct bkey *end,
2432 keybuf_pred_fn *pred)
2459{ 2433{
2460 struct keybuf_key *ret; 2434 struct keybuf_key *ret;
2461 2435
@@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2469 break; 2443 break;
2470 } 2444 }
2471 2445
2472 bch_refill_keybuf(c, buf, end); 2446 bch_refill_keybuf(c, buf, end, pred);
2473 } 2447 }
2474 2448
2475 return ret; 2449 return ret;
2476} 2450}
2477 2451
2478void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) 2452void bch_keybuf_init(struct keybuf *buf)
2479{ 2453{
2480 buf->key_predicate = fn;
2481 buf->last_scanned = MAX_KEY; 2454 buf->last_scanned = MAX_KEY;
2482 buf->keys = RB_ROOT; 2455 buf->keys = RB_ROOT;
2483 2456
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af4a7092a28c..3333d3723633 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,7 +102,6 @@
102#include "debug.h" 102#include "debug.h"
103 103
104struct btree_write { 104struct btree_write {
105 struct closure *owner;
106 atomic_t *journal; 105 atomic_t *journal;
107 106
108 /* If btree_split() frees a btree node, it writes a new pointer to that 107 /* If btree_split() frees a btree node, it writes a new pointer to that
@@ -142,16 +141,12 @@ struct btree {
142 */ 141 */
143 struct bset_tree sets[MAX_BSETS]; 142 struct bset_tree sets[MAX_BSETS];
144 143
145 /* Used to refcount bio splits, also protects b->bio */ 144 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 145 struct closure_with_waitlist io;
147 146
148 /* Gets transferred to w->prio_blocked - see the comment there */
149 int prio_blocked;
150
151 struct list_head list; 147 struct list_head list;
152 struct delayed_work work; 148 struct delayed_work work;
153 149
154 uint64_t io_start_time;
155 struct btree_write writes[2]; 150 struct btree_write writes[2];
156 struct bio *bio; 151 struct bio *bio;
157}; 152};
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
164{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ 159{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
165 160
166enum btree_flags { 161enum btree_flags {
167 BTREE_NODE_read_done,
168 BTREE_NODE_io_error, 162 BTREE_NODE_io_error,
169 BTREE_NODE_dirty, 163 BTREE_NODE_dirty,
170 BTREE_NODE_write_idx, 164 BTREE_NODE_write_idx,
171}; 165};
172 166
173BTREE_FLAG(read_done);
174BTREE_FLAG(io_error); 167BTREE_FLAG(io_error);
175BTREE_FLAG(dirty); 168BTREE_FLAG(dirty);
176BTREE_FLAG(write_idx); 169BTREE_FLAG(write_idx);
@@ -278,6 +271,13 @@ struct btree_op {
278 BKEY_PADDED(replace); 271 BKEY_PADDED(replace);
279}; 272};
280 273
274enum {
275 BTREE_INSERT_STATUS_INSERT,
276 BTREE_INSERT_STATUS_BACK_MERGE,
277 BTREE_INSERT_STATUS_OVERWROTE,
278 BTREE_INSERT_STATUS_FRONT_MERGE,
279};
280
281void bch_btree_op_init_stack(struct btree_op *); 281void bch_btree_op_init_stack(struct btree_op *);
282 282
283static inline void rw_lock(bool w, struct btree *b, int level) 283static inline void rw_lock(bool w, struct btree *b, int level)
@@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
293#ifdef CONFIG_BCACHE_EDEBUG 293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i; 294 unsigned i;
295 295
296 if (w && 296 if (w && b->key.ptr[0])
297 b->key.ptr[0] &&
298 btree_node_read_done(b))
299 for (i = 0; i <= b->nsets; i++) 297 for (i = 0; i <= b->nsets; i++)
300 bch_check_key_order(b, b->sets[i].data); 298 bch_check_key_order(b, b->sets[i].data);
301#endif 299#endif
@@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
370 > btree_blocks(b)); 368 > btree_blocks(b));
371} 369}
372 370
373void bch_btree_read_done(struct closure *); 371void bch_btree_node_read(struct btree *);
374void bch_btree_read(struct btree *); 372void bch_btree_node_write(struct btree *, struct closure *);
375void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
376 373
377void bch_cannibalize_unlock(struct cache_set *, struct closure *); 374void bch_cannibalize_unlock(struct cache_set *, struct closure *);
378void bch_btree_set_root(struct btree *); 375void bch_btree_set_root(struct btree *);
@@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
380struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, 377struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
381 int, struct btree_op *); 378 int, struct btree_op *);
382 379
383bool bch_btree_insert_keys(struct btree *, struct btree_op *);
384bool bch_btree_insert_check_key(struct btree *, struct btree_op *, 380bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
385 struct bio *); 381 struct bio *);
386int bch_btree_insert(struct btree_op *, struct cache_set *); 382int bch_btree_insert(struct btree_op *, struct cache_set *);
@@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
393int bch_btree_check(struct cache_set *, struct btree_op *); 389int bch_btree_check(struct cache_set *, struct btree_op *);
394uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 390uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
395 391
396void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); 392void bch_keybuf_init(struct keybuf *);
397void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); 393void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
394 keybuf_pred_fn *);
398bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 395bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
399 struct bkey *); 396 struct bkey *);
400void bch_keybuf_del(struct keybuf *, struct keybuf_key *); 397void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
401struct keybuf_key *bch_keybuf_next(struct keybuf *); 398struct keybuf_key *bch_keybuf_next(struct keybuf *);
402struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, 399struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
403 struct keybuf *, struct bkey *); 400 struct bkey *, keybuf_pred_fn *);
404 401
405#endif 402#endif
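The btree.h hunks above finish the keybuf refactor visible in btree.c: bch_keybuf_init() no longer takes or stores a keybuf_pred_fn, and the predicate is instead handed to bch_refill_keybuf() and bch_keybuf_next_rescan() on every scan, so the filter belongs to the scan rather than to the buffer. A minimal userspace C sketch of that calling convention; the struct, field names and predicate below are illustrative, not the bcache ones:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct key { unsigned long long inode, offset; };

    /* The predicate is supplied by whoever asks for a refill, not
     * remembered inside the buffer. */
    typedef bool (pred_fn)(const struct key *);

    struct keybuf_sketch {
        struct key keys[8];
        size_t nr;
    };

    static void keybuf_init(struct keybuf_sketch *buf)
    {
        buf->nr = 0;        /* nothing predicate-related stored here any more */
    }

    static void refill(struct keybuf_sketch *buf, const struct key *src,
                       size_t n, pred_fn *pred)
    {
        for (size_t i = 0; i < n && buf->nr < 8; i++)
            if (pred(&src[i]))          /* filter decided per call */
                buf->keys[buf->nr++] = src[i];
    }

    static bool odd_offset(const struct key *k) { return k->offset & 1; }

    int main(void)
    {
        struct key all[] = { {1, 10}, {1, 11}, {2, 12}, {2, 13} };
        struct keybuf_sketch buf;

        keybuf_init(&buf);
        refill(&buf, all, 4, odd_offset);
        printf("kept %zu keys\n", buf.nr);
        return 0;
    }

Presumably the point is that the shared refill path can serve callers with different filters (the hunks below show debugfs passing dump_pred and moving GC passing moving_pred) without the keybuf itself carrying the function pointer.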
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index bd05a9a8c7cf..9aba2017f0d1 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
66 } else { 66 } else {
67 struct closure *parent = cl->parent; 67 struct closure *parent = cl->parent;
68 struct closure_waitlist *wait = closure_waitlist(cl); 68 struct closure_waitlist *wait = closure_waitlist(cl);
69 closure_fn *destructor = cl->fn;
69 70
70 closure_debug_destroy(cl); 71 closure_debug_destroy(cl);
71 72
73 smp_mb();
72 atomic_set(&cl->remaining, -1); 74 atomic_set(&cl->remaining, -1);
73 75
74 if (wait) 76 if (wait)
75 closure_wake_up(wait); 77 closure_wake_up(wait);
76 78
77 if (cl->fn) 79 if (destructor)
78 cl->fn(cl); 80 destructor(cl);
79 81
80 if (parent) 82 if (parent)
81 closure_put(parent); 83 closure_put(parent);
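The closure.c hunk above copies cl->fn into a local destructor before closure_debug_destroy() runs and remaining is set to -1; once waiters have been woken the closure may be reused or freed by another thread, so cl->fn cannot safely be read afterwards. A small C11 sketch of the same capture-then-publish pattern; the struct and names below are invented for illustration, not the kernel closure API:

    #include <stdatomic.h>
    #include <stdio.h>

    typedef void (cb_fn)(const char *);

    struct object {
        cb_fn *fn;          /* may be overwritten once the object is released */
        const char *arg;
        atomic_int live;    /* nonzero while this use of the object is in flight */
    };

    static void say_done(const char *arg) { printf("%s\n", arg); }

    static void release(struct object *obj)
    {
        /* Capture the callback while we still own the object... */
        cb_fn *destructor = obj->fn;
        const char *arg = obj->arg;

        /* ...then publish that it is free.  After this store another
         * thread may reinitialize obj->fn, so it must not be read again. */
        atomic_store_explicit(&obj->live, 0, memory_order_release);

        if (destructor)
            destructor(arg);
    }

    int main(void)
    {
        struct object o = { .fn = say_done, .arg = "done", .live = 1 };

        release(&o);
        return 0;
    }

The smp_mb() added in the hunk appears to serve the same purpose as the release ordering here: the earlier bookkeeping must be visible before remaining is published as -1.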
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..88e6411eab4f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
47 return ""; 47 return "";
48} 48}
49 49
50struct keyprint_hack bch_pkey(const struct bkey *k) 50int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
51{ 51{
52 unsigned i = 0; 52 unsigned i = 0;
53 struct keyprint_hack r; 53 char *out = buf, *end = buf + size;
54 char *out = r.s, *end = r.s + KEYHACK_SIZE;
55 54
56#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) 55#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
57 56
@@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
75 if (KEY_CSUM(k)) 74 if (KEY_CSUM(k))
76 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
77#undef p 76#undef p
78 return r; 77 return out - buf;
79} 78}
80 79
81struct keyprint_hack bch_pbtree(const struct btree *b) 80int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
82{ 81{
83 struct keyprint_hack r; 82 return scnprintf(buf, size, "%zu level %i/%i",
84 83 PTR_BUCKET_NR(b->c, &b->key, 0),
85 snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), 84 b->level, b->c->root ? b->c->root->level : -1);
86 b->level, b->c->root ? b->c->root->level : -1);
87 return r;
88} 85}
89 86
90#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) 87#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
@@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
100{ 97{
101 struct bkey *k; 98 struct bkey *k;
102 unsigned j; 99 unsigned j;
100 char buf[80];
103 101
104 for (k = i->start; k < end(i); k = bkey_next(k)) { 102 for (k = i->start; k < end(i); k = bkey_next(k)) {
103 bch_bkey_to_text(buf, sizeof(buf), k);
105 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
106 (uint64_t *) k - i->d, i->keys, pkey(k)); 105 (uint64_t *) k - i->d, i->keys, buf);
107 106
108 for (j = 0; j < KEY_PTRS(k); j++) { 107 for (j = 0; j < KEY_PTRS(k); j++) {
109 size_t n = PTR_BUCKET_NR(b->c, k, j); 108 size_t n = PTR_BUCKET_NR(b->c, k, j);
@@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
144 v->written = 0; 143 v->written = 0;
145 v->level = b->level; 144 v->level = b->level;
146 145
147 bch_btree_read(v); 146 bch_btree_node_read(v);
148 closure_wait_event(&v->io.wait, &cl, 147 closure_wait_event(&v->io.wait, &cl,
149 atomic_read(&b->io.cl.remaining) == -1); 148 atomic_read(&b->io.cl.remaining) == -1);
150 149
@@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
200 if (!check) 199 if (!check)
201 return; 200 return;
202 201
203 if (bch_bio_alloc_pages(check, GFP_NOIO)) 202 if (bio_alloc_pages(check, GFP_NOIO))
204 goto out_put; 203 goto out_put;
205 204
206 check->bi_rw = READ_SYNC; 205 check->bi_rw = READ_SYNC;
@@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
252 va_list args) 251 va_list args)
253{ 252{
254 unsigned i; 253 unsigned i;
254 char buf[80];
255 255
256 console_lock(); 256 console_lock();
257 257
@@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
262 262
263 console_unlock(); 263 console_unlock();
264 264
265 panic("at %s\n", pbtree(b)); 265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf);
266} 267}
267 268
268void bch_check_key_order_msg(struct btree *b, struct bset *i, 269void bch_check_key_order_msg(struct btree *b, struct bset *i,
@@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
337{ 338{
338 struct dump_iterator *i = file->private_data; 339 struct dump_iterator *i = file->private_data;
339 ssize_t ret = 0; 340 ssize_t ret = 0;
341 char kbuf[80];
340 342
341 while (size) { 343 while (size) {
342 struct keybuf_key *w; 344 struct keybuf_key *w;
@@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
355 if (i->bytes) 357 if (i->bytes)
356 break; 358 break;
357 359
358 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); 360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
359 if (!w) 361 if (!w)
360 break; 362 break;
361 363
362 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); 364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
363 bch_keybuf_del(&i->keys, w); 366 bch_keybuf_del(&i->keys, w);
364 } 367 }
365 368
@@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
377 380
378 file->private_data = i; 381 file->private_data = i;
379 i->c = c; 382 i->c = c;
380 bch_keybuf_init(&i->keys, dump_pred); 383 bch_keybuf_init(&i->keys);
381 i->keys.last_scanned = KEY(0, 0, 0); 384 i->keys.last_scanned = KEY(0, 0, 0);
382 385
383 return 0; 386 return 0;
@@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)
409 412
410#endif 413#endif
411 414
412/* Fuzz tester has rotted: */
413#if 0
414
415static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
416 const char *buffer, size_t size)
417{
418 void dump(struct btree *b)
419 {
420 struct bset *i;
421
422 for (i = b->sets[0].data;
423 index(i, b) < btree_blocks(b) &&
424 i->seq == b->sets[0].data->seq;
425 i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
426 dump_bset(b, i);
427 }
428
429 struct cache_sb *sb;
430 struct cache_set *c;
431 struct btree *all[3], *b, *fill, *orig;
432 int j;
433
434 struct btree_op op;
435 bch_btree_op_init_stack(&op);
436
437 sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
438 if (!sb)
439 return -ENOMEM;
440
441 sb->bucket_size = 128;
442 sb->block_size = 4;
443
444 c = bch_cache_set_alloc(sb);
445 if (!c)
446 return -ENOMEM;
447
448 for (j = 0; j < 3; j++) {
449 BUG_ON(list_empty(&c->btree_cache));
450 all[j] = list_first_entry(&c->btree_cache, struct btree, list);
451 list_del_init(&all[j]->list);
452
453 all[j]->key = KEY(0, 0, c->sb.bucket_size);
454 bkey_copy_key(&all[j]->key, &MAX_KEY);
455 }
456
457 b = all[0];
458 fill = all[1];
459 orig = all[2];
460
461 while (1) {
462 for (j = 0; j < 3; j++)
463 all[j]->written = all[j]->nsets = 0;
464
465 bch_bset_init_next(b);
466
467 while (1) {
468 struct bset *i = write_block(b);
469 struct bkey *k = op.keys.top;
470 unsigned rand;
471
472 bkey_init(k);
473 rand = get_random_int();
474
475 op.type = rand & 1
476 ? BTREE_INSERT
477 : BTREE_REPLACE;
478 rand >>= 1;
479
480 SET_KEY_SIZE(k, bucket_remainder(c, rand));
481 rand >>= c->bucket_bits;
482 rand &= 1024 * 512 - 1;
483 rand += c->sb.bucket_size;
484 SET_KEY_OFFSET(k, rand);
485#if 0
486 SET_KEY_PTRS(k, 1);
487#endif
488 bch_keylist_push(&op.keys);
489 bch_btree_insert_keys(b, &op);
490
491 if (should_split(b) ||
492 set_blocks(i, b->c) !=
493 __set_blocks(i, i->keys + 15, b->c)) {
494 i->csum = csum_set(i);
495
496 memcpy(write_block(fill),
497 i, set_bytes(i));
498
499 b->written += set_blocks(i, b->c);
500 fill->written = b->written;
501 if (b->written == btree_blocks(b))
502 break;
503
504 bch_btree_sort_lazy(b);
505 bch_bset_init_next(b);
506 }
507 }
508
509 memcpy(orig->sets[0].data,
510 fill->sets[0].data,
511 btree_bytes(c));
512
513 bch_btree_sort(b);
514 fill->written = 0;
515 bch_btree_read_done(&fill->io.cl);
516
517 if (b->sets[0].data->keys != fill->sets[0].data->keys ||
518 memcmp(b->sets[0].data->start,
519 fill->sets[0].data->start,
520 b->sets[0].data->keys * sizeof(uint64_t))) {
521 struct bset *i = b->sets[0].data;
522 struct bkey *k, *l;
523
524 for (k = i->start,
525 l = fill->sets[0].data->start;
526 k < end(i);
527 k = bkey_next(k), l = bkey_next(l))
528 if (bkey_cmp(k, l) ||
529 KEY_SIZE(k) != KEY_SIZE(l))
530 pr_err("key %zi differs: %s != %s",
531 (uint64_t *) k - i->d,
532 pkey(k), pkey(l));
533
534 for (j = 0; j < 3; j++) {
535 pr_err("**** Set %i ****", j);
536 dump(all[j]);
537 }
538 panic("\n");
539 }
540
541 pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
542 }
543}
544
545kobj_attribute_write(fuzz, btree_fuzz);
546#endif
547
548void bch_debug_exit(void) 415void bch_debug_exit(void)
549{ 416{
550 if (!IS_ERR_OR_NULL(debug)) 417 if (!IS_ERR_OR_NULL(debug))
@@ -554,11 +421,6 @@ void bch_debug_exit(void)
554int __init bch_debug_init(struct kobject *kobj) 421int __init bch_debug_init(struct kobject *kobj)
555{ 422{
556 int ret = 0; 423 int ret = 0;
557#if 0
558 ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
559 if (ret)
560 return ret;
561#endif
562 424
563 debug = debugfs_create_dir("bcache", NULL); 425 debug = debugfs_create_dir("bcache", NULL);
564 return ret; 426 return ret;
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index f9378a218148..1c39b5a2489b 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -3,15 +3,8 @@
3 3
4/* Btree/bkey debug printing */ 4/* Btree/bkey debug printing */
5 5
6#define KEYHACK_SIZE 80 6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
7struct keyprint_hack { 7int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
8 char s[KEYHACK_SIZE];
9};
10
11struct keyprint_hack bch_pkey(const struct bkey *k);
12struct keyprint_hack bch_pbtree(const struct btree *b);
13#define pkey(k) (&bch_pkey(k).s[0])
14#define pbtree(b) (&bch_pbtree(b).s[0])
15 8
16#ifdef CONFIG_BCACHE_EDEBUG 9#ifdef CONFIG_BCACHE_EDEBUG
17 10
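The debug.c and debug.h hunks above retire the pkey()/pbtree() macros, which returned a struct keyprint_hack wrapping a fixed 80-byte buffer, in favor of bch_bkey_to_text() and bch_btree_to_text(): the caller supplies the buffer, the helper formats into it with scnprintf and returns the length, and every call site above now declares its own char buf[80]. A stripped-down sketch of that interface shape; the key layout is simplified, not the real bkey:

    #include <stdio.h>

    struct bkey_sketch { unsigned long long inode, offset, size; };

    /* Format into a caller-owned buffer and return snprintf's result.
     * (The kernel helpers use scnprintf, which additionally caps the
     * return value at the space actually used.) */
    static int bkey_to_text(char *buf, size_t size, const struct bkey_sketch *k)
    {
        return snprintf(buf, size, "%llu:%llu len %llu",
                        k->inode, k->offset, k->size);
    }

    int main(void)
    {
        char buf[80];       /* each caller brings its own buffer */
        struct bkey_sketch k = { 1, 4096, 8 };

        bkey_to_text(buf, sizeof(buf), &k);
        printf("key %s\n", buf);
        return 0;
    }

With the buffer owned by the caller, the string's lifetime is explicit and the shared fixed-size return struct disappears.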
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 48efd4dea645..9056632995b1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -9,6 +9,8 @@
9#include "bset.h" 9#include "bset.h"
10#include "debug.h" 10#include "debug.h"
11 11
12#include <linux/blkdev.h>
13
12static void bch_bi_idx_hack_endio(struct bio *bio, int error) 14static void bch_bi_idx_hack_endio(struct bio *bio, int error)
13{ 15{
14 struct bio *p = bio->bi_private; 16 struct bio *p = bio->bi_private;
@@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
66 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a 68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
67 * bvec boundry; it is the caller's responsibility to ensure that @bio is not 69 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
68 * freed before the split. 70 * freed before the split.
69 *
70 * If bch_bio_split() is running under generic_make_request(), it's not safe to
71 * allocate more than one bio from the same bio set. Therefore, if it is running
72 * under generic_make_request() it masks out __GFP_WAIT when doing the
73 * allocation. The caller must check for failure if there's any possibility of
74 * it being called from under generic_make_request(); it is then the caller's
75 * responsibility to retry from a safe context (by e.g. punting to workqueue).
76 */ 71 */
77struct bio *bch_bio_split(struct bio *bio, int sectors, 72struct bio *bch_bio_split(struct bio *bio, int sectors,
78 gfp_t gfp, struct bio_set *bs) 73 gfp_t gfp, struct bio_set *bs)
@@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
83 78
84 BUG_ON(sectors <= 0); 79 BUG_ON(sectors <= 0);
85 80
86 /*
87 * If we're being called from underneath generic_make_request() and we
88 * already allocated any bios from this bio set, we risk deadlock if we
89 * use the mempool. So instead, we possibly fail and let the caller punt
90 * to workqueue or somesuch and retry in a safe context.
91 */
92 if (current->bio_list)
93 gfp &= ~__GFP_WAIT;
94
95 if (sectors >= bio_sectors(bio)) 81 if (sectors >= bio_sectors(bio))
96 return bio; 82 return bio;
97 83
98 if (bio->bi_rw & REQ_DISCARD) { 84 if (bio->bi_rw & REQ_DISCARD) {
99 ret = bio_alloc_bioset(gfp, 1, bs); 85 ret = bio_alloc_bioset(gfp, 1, bs);
86 if (!ret)
87 return NULL;
100 idx = 0; 88 idx = 0;
101 goto out; 89 goto out;
102 } 90 }
@@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
160 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 148 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
161 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
162 queue_max_segments(q)); 150 queue_max_segments(q));
163 struct bio_vec *bv, *end = bio_iovec(bio) +
164 min_t(int, bio_segments(bio), max_segments);
165 151
166 if (bio->bi_rw & REQ_DISCARD) 152 if (bio->bi_rw & REQ_DISCARD)
167 return min(ret, q->limits.max_discard_sectors); 153 return min(ret, q->limits.max_discard_sectors);
168 154
169 if (bio_segments(bio) > max_segments || 155 if (bio_segments(bio) > max_segments ||
170 q->merge_bvec_fn) { 156 q->merge_bvec_fn) {
157 struct bio_vec *bv;
158 int i, seg = 0;
159
171 ret = 0; 160 ret = 0;
172 161
173 for (bv = bio_iovec(bio); bv < end; bv++) { 162 bio_for_each_segment(bv, bio, i) {
174 struct bvec_merge_data bvm = { 163 struct bvec_merge_data bvm = {
175 .bi_bdev = bio->bi_bdev, 164 .bi_bdev = bio->bi_bdev,
176 .bi_sector = bio->bi_sector, 165 .bi_sector = bio->bi_sector,
@@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
178 .bi_rw = bio->bi_rw, 167 .bi_rw = bio->bi_rw,
179 }; 168 };
180 169
170 if (seg == max_segments)
171 break;
172
181 if (q->merge_bvec_fn && 173 if (q->merge_bvec_fn &&
182 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
183 break; 175 break;
184 176
177 seg++;
185 ret += bv->bv_len >> 9; 178 ret += bv->bv_len >> 9;
186 } 179 }
187 } 180 }
@@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
218 closure_put(cl); 211 closure_put(cl);
219} 212}
220 213
221static void __bch_bio_submit_split(struct closure *cl)
222{
223 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
224 struct bio *bio = s->bio, *n;
225
226 do {
227 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
228 GFP_NOIO, s->p->bio_split);
229 if (!n)
230 continue_at(cl, __bch_bio_submit_split, system_wq);
231
232 n->bi_end_io = bch_bio_submit_split_endio;
233 n->bi_private = cl;
234
235 closure_get(cl);
236 bch_generic_make_request_hack(n);
237 } while (n != bio);
238
239 continue_at(cl, bch_bio_submit_split_done, NULL);
240}
241
242void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) 214void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
243{ 215{
244 struct bio_split_hook *s; 216 struct bio_split_hook *s;
217 struct bio *n;
245 218
246 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) 219 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
247 goto submit; 220 goto submit;
@@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
250 goto submit; 223 goto submit;
251 224
252 s = mempool_alloc(p->bio_split_hook, GFP_NOIO); 225 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
226 closure_init(&s->cl, NULL);
253 227
254 s->bio = bio; 228 s->bio = bio;
255 s->p = p; 229 s->p = p;
@@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
257 s->bi_private = bio->bi_private; 231 s->bi_private = bio->bi_private;
258 bio_get(bio); 232 bio_get(bio);
259 233
260 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); 234 do {
261 return; 235 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split);
237
238 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl;
240
241 closure_get(&s->cl);
242 bch_generic_make_request_hack(n);
243 } while (n != bio);
244
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
262submit: 246submit:
263 bch_generic_make_request_hack(bio); 247 bch_generic_make_request_hack(bio);
264} 248}
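With the io.c hunks above, the detour through __bch_bio_submit_split() and the workqueue is gone: bch_generic_make_request() now initializes the split hook's closure up front, then splits and submits the fragments inline, taking one closure reference per fragment and handing off through continue_at() once the whole bio has been issued. A userspace sketch of that reference-counted fan-out; a plain counter stands in for the closure, and nothing below is the bcache API:

    #include <stdio.h>

    struct request { long sector, nr_sectors; };

    struct tracker { int refs; };       /* stands in for the closure refcount */

    static void get(struct tracker *t) { t->refs++; }

    static void put(struct tracker *t)
    {
        if (--t->refs == 0)
            printf("all fragments completed\n");
    }

    static void submit_fragment(struct tracker *t, long sector, long len)
    {
        printf("submit %ld+%ld\n", sector, len);
        put(t);                         /* fragment "completes" right away here */
    }

    int main(void)
    {
        struct request rq = { .sector = 0, .nr_sectors = 1000 };
        const long max = 256;           /* per-fragment limit, like bch_bio_max_sectors() */
        struct tracker t = { .refs = 1 };   /* initial reference held by the submitter */

        while (rq.nr_sectors) {
            long len = rq.nr_sectors < max ? rq.nr_sectors : max;

            get(&t);                    /* one reference per in-flight fragment */
            submit_fragment(&t, rq.sector, len);

            rq.sector += len;
            rq.nr_sectors -= len;
        }

        put(&t);                        /* drop the submitter's reference */
        return 0;
    }

Without the __GFP_WAIT masking, a GFP_NOIO split from the bioset waits rather than fails, which is presumably why the !n retry branches also disappear from bch_insert_data_loop() and cached_dev_cache_miss() in the request.c hunks further down.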
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..ba95ab84b2be 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -9,6 +9,8 @@
9#include "debug.h" 9#include "debug.h"
10#include "request.h" 10#include "request.h"
11 11
12#include <trace/events/bcache.h>
13
12/* 14/*
13 * Journal replay/recovery: 15 * Journal replay/recovery:
14 * 16 *
@@ -182,9 +184,14 @@ bsearch:
182 pr_debug("starting binary search, l %u r %u", l, r); 184 pr_debug("starting binary search, l %u r %u", l, r);
183 185
184 while (l + 1 < r) { 186 while (l + 1 < r) {
187 seq = list_entry(list->prev, struct journal_replay,
188 list)->j.seq;
189
185 m = (l + r) >> 1; 190 m = (l + r) >> 1;
191 read_bucket(m);
186 192
187 if (read_bucket(m)) 193 if (seq != list_entry(list->prev, struct journal_replay,
194 list)->j.seq)
188 l = m; 195 l = m;
189 else 196 else
190 r = m; 197 r = m;
@@ -300,7 +307,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
300 for (k = i->j.start; 307 for (k = i->j.start;
301 k < end(&i->j); 308 k < end(&i->j);
302 k = bkey_next(k)) { 309 k = bkey_next(k)) {
303 pr_debug("%s", pkey(k)); 310 trace_bcache_journal_replay_key(k);
311
304 bkey_copy(op->keys.top, k); 312 bkey_copy(op->keys.top, k);
305 bch_keylist_push(&op->keys); 313 bch_keylist_push(&op->keys);
306 314
@@ -384,7 +392,7 @@ out:
384 return; 392 return;
385found: 393found:
386 if (btree_node_dirty(best)) 394 if (btree_node_dirty(best))
387 bch_btree_write(best, true, NULL); 395 bch_btree_node_write(best, NULL);
388 rw_unlock(true, best); 396 rw_unlock(true, best);
389} 397}
390 398
@@ -617,7 +625,7 @@ static void journal_write_unlocked(struct closure *cl)
617 bio_reset(bio); 625 bio_reset(bio);
618 bio->bi_sector = PTR_OFFSET(k, i); 626 bio->bi_sector = PTR_OFFSET(k, i);
619 bio->bi_bdev = ca->bdev; 627 bio->bi_bdev = ca->bdev;
620 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; 628 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
621 bio->bi_size = sectors << 9; 629 bio->bi_size = sectors << 9;
622 630
623 bio->bi_end_io = journal_write_endio; 631 bio->bi_end_io = journal_write_endio;
@@ -712,7 +720,8 @@ void bch_journal(struct closure *cl)
712 spin_lock(&c->journal.lock); 720 spin_lock(&c->journal.lock);
713 721
714 if (journal_full(&c->journal)) { 722 if (journal_full(&c->journal)) {
715 /* XXX: tracepoint */ 723 trace_bcache_journal_full(c);
724
716 closure_wait(&c->journal.wait, cl); 725 closure_wait(&c->journal.wait, cl);
717 726
718 journal_reclaim(c); 727 journal_reclaim(c);
@@ -728,13 +737,15 @@ void bch_journal(struct closure *cl)
728 737
729 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || 738 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
730 b > c->journal.blocks_free) { 739 b > c->journal.blocks_free) {
731 /* XXX: If we were inserting so many keys that they won't fit in 740 trace_bcache_journal_entry_full(c);
741
742 /*
743 * XXX: If we were inserting so many keys that they won't fit in
732 * an _empty_ journal write, we'll deadlock. For now, handle 744 * an _empty_ journal write, we'll deadlock. For now, handle
733 * this in bch_keylist_realloc() - but something to think about. 745 * this in bch_keylist_realloc() - but something to think about.
734 */ 746 */
735 BUG_ON(!w->data->keys); 747 BUG_ON(!w->data->keys);
736 748
737 /* XXX: tracepoint */
738 BUG_ON(!closure_wait(&w->wait, cl)); 749 BUG_ON(!closure_wait(&w->wait, cl));
739 750
740 closure_flush(&c->journal.io); 751 closure_flush(&c->journal.io);
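In the journal.c bsearch hunk above, the test for whether bucket m holds journal entries changes: the code snapshots the sequence number of the newest entry on the replay list, calls read_bucket(m), and treats any change in that sequence number as evidence that m contained valid entries, instead of relying on read_bucket()'s return value. A toy sketch of that probe-by-side-effect binary search, under the simplifying assumption that every bucket below some unknown boundary is live:

    #include <stdio.h>

    /* Toy stand-in for the journal: buckets 0..boundary-1 contain entries,
     * the rest are empty.  last_seq plays the role of the newest sequence
     * number seen on the replay list. */
    static int boundary = 13;           /* not known to the search */
    static long last_seq;

    static void read_bucket(int m)
    {
        if (m < boundary)
            last_seq = 1000 + m;        /* reading a live bucket changes the newest seq */
    }

    int main(void)
    {
        int l = 0, r = 64;              /* invariant: bucket l live, bucket r not */

        read_bucket(l);
        while (l + 1 < r) {
            long seq = last_seq;        /* snapshot before the probe */
            int m = (l + r) / 2;

            read_bucket(m);
            if (last_seq != seq)        /* the probe added newer entries: m is live */
                l = m;
            else
                r = m;
        }
        printf("last live bucket: %d\n", l);
        return 0;
    }

This is only the shape of the search; the real invariants over journal sequence numbers are more involved than this toy model.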
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 8589512c972e..1a3b4f4786c3 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -9,6 +9,8 @@
9#include "debug.h" 9#include "debug.h"
10#include "request.h" 10#include "request.h"
11 11
12#include <trace/events/bcache.h>
13
12struct moving_io { 14struct moving_io {
13 struct keybuf_key *w; 15 struct keybuf_key *w;
14 struct search s; 16 struct search s;
@@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
44{ 46{
45 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 47 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
46 struct bio *bio = &io->bio.bio; 48 struct bio *bio = &io->bio.bio;
47 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); 49 struct bio_vec *bv;
50 int i;
48 51
49 while (bv-- != bio->bi_io_vec) 52 bio_for_each_segment_all(bv, bio, i)
50 __free_page(bv->bv_page); 53 __free_page(bv->bv_page);
51 54
52 pr_debug("%s %s", io->s.op.insert_collision 55 if (io->s.op.insert_collision)
53 ? "collision moving" : "moved", 56 trace_bcache_gc_copy_collision(&io->w->key);
54 pkey(&io->w->key));
55 57
56 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
57 59
@@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
94 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct moving_io *io = container_of(s, struct moving_io, s);
95 97
96 if (!s->error) { 98 if (!s->error) {
97 trace_bcache_write_moving(&io->bio.bio);
98
99 moving_init(io); 99 moving_init(io);
100 100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
@@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
122 struct moving_io *io = container_of(s, struct moving_io, s); 122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio; 123 struct bio *bio = &io->bio.bio;
124 124
125 trace_bcache_read_moving(bio);
126 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
127 126
128 continue_at(cl, write_moving, bch_gc_wq); 127 continue_at(cl, write_moving, bch_gc_wq);
@@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
138 /* XXX: if we error, background writeback could stall indefinitely */ 137 /* XXX: if we error, background writeback could stall indefinitely */
139 138
140 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { 139 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
141 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); 140 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
141 &MAX_KEY, moving_pred);
142 if (!w) 142 if (!w)
143 break; 143 break;
144 144
@@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
159 bio->bi_rw = READ; 159 bio->bi_rw = READ;
160 bio->bi_end_io = read_moving_endio; 160 bio->bi_end_io = read_moving_endio;
161 161
162 if (bch_bio_alloc_pages(bio, GFP_KERNEL)) 162 if (bio_alloc_pages(bio, GFP_KERNEL))
163 goto err; 163 goto err;
164 164
165 pr_debug("%s", pkey(&w->key)); 165 trace_bcache_gc_copy(&w->key);
166 166
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
168 168
@@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)
250 250
251void bch_moving_init_cache_set(struct cache_set *c) 251void bch_moving_init_cache_set(struct cache_set *c)
252{ 252{
253 bch_keybuf_init(&c->moving_gc_keys, moving_pred); 253 bch_keybuf_init(&c->moving_gc_keys);
254} 254}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e5ff12e52d5b..786a1a4f74d8 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -10,6 +10,7 @@
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "request.h" 12#include "request.h"
13#include "writeback.h"
13 14
14#include <linux/cgroup.h> 15#include <linux/cgroup.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -21,8 +22,6 @@
21 22
22#define CUTOFF_CACHE_ADD 95 23#define CUTOFF_CACHE_ADD 95
23#define CUTOFF_CACHE_READA 90 24#define CUTOFF_CACHE_READA 90
24#define CUTOFF_WRITEBACK 50
25#define CUTOFF_WRITEBACK_SYNC 75
26 25
27struct kmem_cache *bch_search_cache; 26struct kmem_cache *bch_search_cache;
28 27
@@ -489,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl)
489 bch_queue_gc(op->c); 488 bch_queue_gc(op->c);
490 } 489 }
491 490
491 /*
492 * Journal writes are marked REQ_FLUSH; if the original write was a
493 * flush, it'll wait on the journal write.
494 */
495 bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
496
492 do { 497 do {
493 unsigned i; 498 unsigned i;
494 struct bkey *k; 499 struct bkey *k;
@@ -510,10 +515,6 @@ static void bch_insert_data_loop(struct closure *cl)
510 goto err; 515 goto err;
511 516
512 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 517 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
513 if (!n) {
514 __bkey_put(op->c, k);
515 continue_at(cl, bch_insert_data_loop, bcache_wq);
516 }
517 518
518 n->bi_end_io = bch_insert_data_endio; 519 n->bi_end_io = bch_insert_data_endio;
519 n->bi_private = cl; 520 n->bi_private = cl;
@@ -530,10 +531,9 @@ static void bch_insert_data_loop(struct closure *cl)
530 if (KEY_CSUM(k)) 531 if (KEY_CSUM(k))
531 bio_csum(n, k); 532 bio_csum(n, k);
532 533
533 pr_debug("%s", pkey(k)); 534 trace_bcache_cache_insert(k);
534 bch_keylist_push(&op->keys); 535 bch_keylist_push(&op->keys);
535 536
536 trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
537 n->bi_rw |= REQ_WRITE; 537 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0); 538 bch_submit_bbio(n, op->c, k, 0);
539 } while (n != bio); 539 } while (n != bio);
@@ -716,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
716 s->task = current; 716 s->task = current;
717 s->orig_bio = bio; 717 s->orig_bio = bio;
718 s->write = (bio->bi_rw & REQ_WRITE) != 0; 718 s->write = (bio->bi_rw & REQ_WRITE) != 0;
719 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; 719 s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
720 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; 720 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
721 s->recoverable = 1; 721 s->recoverable = 1;
722 s->start_time = jiffies; 722 s->start_time = jiffies;
@@ -784,11 +784,8 @@ static void request_read_error(struct closure *cl)
784 int i; 784 int i;
785 785
786 if (s->recoverable) { 786 if (s->recoverable) {
787 /* The cache read failed, but we can retry from the backing 787 /* Retry from the backing device: */
788 * device. 788 trace_bcache_read_retry(s->orig_bio);
789 */
790 pr_debug("recovering at sector %llu",
791 (uint64_t) s->orig_bio->bi_sector);
792 789
793 s->error = 0; 790 s->error = 0;
794 bv = s->bio.bio.bi_io_vec; 791 bv = s->bio.bio.bi_io_vec;
@@ -806,7 +803,6 @@ static void request_read_error(struct closure *cl)
806 803
807 /* XXX: invalidate cache */ 804 /* XXX: invalidate cache */
808 805
809 trace_bcache_read_retry(&s->bio.bio);
810 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 806 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
811 } 807 }
812 808
@@ -827,53 +823,13 @@ static void request_read_done(struct closure *cl)
827 */ 823 */
828 824
829 if (s->op.cache_bio) { 825 if (s->op.cache_bio) {
830 struct bio_vec *src, *dst;
831 unsigned src_offset, dst_offset, bytes;
832 void *dst_ptr;
833
834 bio_reset(s->op.cache_bio); 826 bio_reset(s->op.cache_bio);
835 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 827 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
836 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 828 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
837 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 829 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
838 bch_bio_map(s->op.cache_bio, NULL); 830 bch_bio_map(s->op.cache_bio, NULL);
839 831
840 src = bio_iovec(s->op.cache_bio); 832 bio_copy_data(s->cache_miss, s->op.cache_bio);
841 dst = bio_iovec(s->cache_miss);
842 src_offset = src->bv_offset;
843 dst_offset = dst->bv_offset;
844 dst_ptr = kmap(dst->bv_page);
845
846 while (1) {
847 if (dst_offset == dst->bv_offset + dst->bv_len) {
848 kunmap(dst->bv_page);
849 dst++;
850 if (dst == bio_iovec_idx(s->cache_miss,
851 s->cache_miss->bi_vcnt))
852 break;
853
854 dst_offset = dst->bv_offset;
855 dst_ptr = kmap(dst->bv_page);
856 }
857
858 if (src_offset == src->bv_offset + src->bv_len) {
859 src++;
860 if (src == bio_iovec_idx(s->op.cache_bio,
861 s->op.cache_bio->bi_vcnt))
862 BUG();
863
864 src_offset = src->bv_offset;
865 }
866
867 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
868 src->bv_offset + src->bv_len - src_offset);
869
870 memcpy(dst_ptr + dst_offset,
871 page_address(src->bv_page) + src_offset,
872 bytes);
873
874 src_offset += bytes;
875 dst_offset += bytes;
876 }
877 833
878 bio_put(s->cache_miss); 834 bio_put(s->cache_miss);
879 s->cache_miss = NULL; 835 s->cache_miss = NULL;
@@ -899,6 +855,7 @@ static void request_read_done_bh(struct closure *cl)
899 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 855 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
900 856
901 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 857 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
858 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
902 859
903 if (s->error) 860 if (s->error)
904 continue_at_nobarrier(cl, request_read_error, bcache_wq); 861 continue_at_nobarrier(cl, request_read_error, bcache_wq);
@@ -917,9 +874,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
917 struct bio *miss; 874 struct bio *miss;
918 875
919 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 876 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
920 if (!miss)
921 return -EAGAIN;
922
923 if (miss == bio) 877 if (miss == bio)
924 s->op.lookup_done = true; 878 s->op.lookup_done = true;
925 879
@@ -938,8 +892,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
938 reada = min(dc->readahead >> 9, 892 reada = min(dc->readahead >> 9,
939 sectors - bio_sectors(miss)); 893 sectors - bio_sectors(miss));
940 894
941 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) 895 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
942 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); 896 reada = bdev_sectors(miss->bi_bdev) -
897 bio_end_sector(miss);
943 } 898 }
944 899
945 s->cache_bio_sectors = bio_sectors(miss) + reada; 900 s->cache_bio_sectors = bio_sectors(miss) + reada;
@@ -963,13 +918,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
963 goto out_put; 918 goto out_put;
964 919
965 bch_bio_map(s->op.cache_bio, NULL); 920 bch_bio_map(s->op.cache_bio, NULL);
966 if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 921 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
967 goto out_put; 922 goto out_put;
968 923
969 s->cache_miss = miss; 924 s->cache_miss = miss;
970 bio_get(s->op.cache_bio); 925 bio_get(s->op.cache_bio);
971 926
972 trace_bcache_cache_miss(s->orig_bio);
973 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 927 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
974 928
975 return ret; 929 return ret;
@@ -1002,24 +956,13 @@ static void cached_dev_write_complete(struct closure *cl)
1002 cached_dev_bio_complete(cl); 956 cached_dev_bio_complete(cl);
1003} 957}
1004 958
1005static bool should_writeback(struct cached_dev *dc, struct bio *bio)
1006{
1007 unsigned threshold = (bio->bi_rw & REQ_SYNC)
1008 ? CUTOFF_WRITEBACK_SYNC
1009 : CUTOFF_WRITEBACK;
1010
1011 return !atomic_read(&dc->disk.detaching) &&
1012 cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
1013 dc->disk.c->gc_stats.in_use < threshold;
1014}
1015
1016static void request_write(struct cached_dev *dc, struct search *s) 959static void request_write(struct cached_dev *dc, struct search *s)
1017{ 960{
1018 struct closure *cl = &s->cl; 961 struct closure *cl = &s->cl;
1019 struct bio *bio = &s->bio.bio; 962 struct bio *bio = &s->bio.bio;
1020 struct bkey start, end; 963 struct bkey start, end;
1021 start = KEY(dc->disk.id, bio->bi_sector, 0); 964 start = KEY(dc->disk.id, bio->bi_sector, 0);
1022 end = KEY(dc->disk.id, bio_end(bio), 0); 965 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1023 966
1024 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 967 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1025 968
@@ -1034,22 +977,37 @@ static void request_write(struct cached_dev *dc, struct search *s)
1034 if (bio->bi_rw & REQ_DISCARD) 977 if (bio->bi_rw & REQ_DISCARD)
1035 goto skip; 978 goto skip;
1036 979
980 if (should_writeback(dc, s->orig_bio,
981 cache_mode(dc, bio),
982 s->op.skip)) {
983 s->op.skip = false;
984 s->writeback = true;
985 }
986
1037 if (s->op.skip) 987 if (s->op.skip)
1038 goto skip; 988 goto skip;
1039 989
1040 if (should_writeback(dc, s->orig_bio)) 990 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
1041 s->writeback = true;
1042 991
1043 if (!s->writeback) { 992 if (!s->writeback) {
1044 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 993 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1045 dc->disk.bio_split); 994 dc->disk.bio_split);
1046 995
1047 trace_bcache_writethrough(s->orig_bio);
1048 closure_bio_submit(bio, cl, s->d); 996 closure_bio_submit(bio, cl, s->d);
1049 } else { 997 } else {
1050 s->op.cache_bio = bio; 998 bch_writeback_add(dc);
1051 trace_bcache_writeback(s->orig_bio); 999
1052 bch_writeback_add(dc, bio_sectors(bio)); 1000 if (s->op.flush_journal) {
1001 /* Also need to send a flush to the backing device */
1002 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1003 dc->disk.bio_split);
1004
1005 bio->bi_size = 0;
1006 bio->bi_vcnt = 0;
1007 closure_bio_submit(bio, cl, s->d);
1008 } else {
1009 s->op.cache_bio = bio;
1010 }
1053 } 1011 }
1054out: 1012out:
1055 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1013 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
@@ -1058,7 +1016,6 @@ skip:
1058 s->op.skip = true; 1016 s->op.skip = true;
1059 s->op.cache_bio = s->orig_bio; 1017 s->op.cache_bio = s->orig_bio;
1060 bio_get(s->op.cache_bio); 1018 bio_get(s->op.cache_bio);
1061 trace_bcache_write_skip(s->orig_bio);
1062 1019
1063 if ((bio->bi_rw & REQ_DISCARD) && 1020 if ((bio->bi_rw & REQ_DISCARD) &&
1064 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1021 !blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1088,9 +1045,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
1088 1045
1089/* Cached devices - read & write stuff */ 1046/* Cached devices - read & write stuff */
1090 1047
1091int bch_get_congested(struct cache_set *c) 1048unsigned bch_get_congested(struct cache_set *c)
1092{ 1049{
1093 int i; 1050 int i;
1051 long rand;
1094 1052
1095 if (!c->congested_read_threshold_us && 1053 if (!c->congested_read_threshold_us &&
1096 !c->congested_write_threshold_us) 1054 !c->congested_write_threshold_us)
@@ -1106,7 +1064,13 @@ int bch_get_congested(struct cache_set *c)
1106 1064
1107 i += CONGESTED_MAX; 1065 i += CONGESTED_MAX;
1108 1066
1109 return i <= 0 ? 1 : fract_exp_two(i, 6); 1067 if (i > 0)
1068 i = fract_exp_two(i, 6);
1069
1070 rand = get_random_int();
1071 i -= bitmap_weight(&rand, BITS_PER_LONG);
1072
1073 return i > 0 ? i : 1;
1110} 1074}
1111 1075
1112static void add_sequential(struct task_struct *t) 1076static void add_sequential(struct task_struct *t)
@@ -1126,10 +1090,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
1126{ 1090{
1127 struct cache_set *c = s->op.c; 1091 struct cache_set *c = s->op.c;
1128 struct bio *bio = &s->bio.bio; 1092 struct bio *bio = &s->bio.bio;
1129
1130 long rand;
1131 int cutoff = bch_get_congested(c);
1132 unsigned mode = cache_mode(dc, bio); 1093 unsigned mode = cache_mode(dc, bio);
1094 unsigned sectors, congested = bch_get_congested(c);
1133 1095
1134 if (atomic_read(&dc->disk.detaching) || 1096 if (atomic_read(&dc->disk.detaching) ||
1135 c->gc_stats.in_use > CUTOFF_CACHE_ADD || 1097 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -1147,17 +1109,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
1147 goto skip; 1109 goto skip;
1148 } 1110 }
1149 1111
1150 if (!cutoff) { 1112 if (!congested && !dc->sequential_cutoff)
1151 cutoff = dc->sequential_cutoff >> 9; 1113 goto rescale;
1152 1114
1153 if (!cutoff) 1115 if (!congested &&
1154 goto rescale; 1116 mode == CACHE_MODE_WRITEBACK &&
1155 1117 (bio->bi_rw & REQ_WRITE) &&
1156 if (mode == CACHE_MODE_WRITEBACK && 1118 (bio->bi_rw & REQ_SYNC))
1157 (bio->bi_rw & REQ_WRITE) && 1119 goto rescale;
1158 (bio->bi_rw & REQ_SYNC))
1159 goto rescale;
1160 }
1161 1120
1162 if (dc->sequential_merge) { 1121 if (dc->sequential_merge) {
1163 struct io *i; 1122 struct io *i;
@@ -1177,7 +1136,7 @@ found:
1177 if (i->sequential + bio->bi_size > i->sequential) 1136 if (i->sequential + bio->bi_size > i->sequential)
1178 i->sequential += bio->bi_size; 1137 i->sequential += bio->bi_size;
1179 1138
1180 i->last = bio_end(bio); 1139 i->last = bio_end_sector(bio);
1181 i->jiffies = jiffies + msecs_to_jiffies(5000); 1140 i->jiffies = jiffies + msecs_to_jiffies(5000);
1182 s->task->sequential_io = i->sequential; 1141 s->task->sequential_io = i->sequential;
1183 1142
@@ -1192,12 +1151,19 @@ found:
1192 add_sequential(s->task); 1151 add_sequential(s->task);
1193 } 1152 }
1194 1153
1195 rand = get_random_int(); 1154 sectors = max(s->task->sequential_io,
1196 cutoff -= bitmap_weight(&rand, BITS_PER_LONG); 1155 s->task->sequential_io_avg) >> 9;
1197 1156
1198 if (cutoff <= (int) (max(s->task->sequential_io, 1157 if (dc->sequential_cutoff &&
1199 s->task->sequential_io_avg) >> 9)) 1158 sectors >= dc->sequential_cutoff >> 9) {
1159 trace_bcache_bypass_sequential(s->orig_bio);
1200 goto skip; 1160 goto skip;
1161 }
1162
1163 if (congested && sectors >= congested) {
1164 trace_bcache_bypass_congested(s->orig_bio);
1165 goto skip;
1166 }
1201 1167
1202rescale: 1168rescale:
1203 bch_rescale_priorities(c, bio_sectors(bio)); 1169 bch_rescale_priorities(c, bio_sectors(bio));
@@ -1288,30 +1254,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
1288static int flash_dev_cache_miss(struct btree *b, struct search *s, 1254static int flash_dev_cache_miss(struct btree *b, struct search *s,
1289 struct bio *bio, unsigned sectors) 1255 struct bio *bio, unsigned sectors)
1290{ 1256{
1257 struct bio_vec *bv;
1258 int i;
1259
1291 /* Zero fill bio */ 1260 /* Zero fill bio */
1292 1261
1293 while (bio->bi_idx != bio->bi_vcnt) { 1262 bio_for_each_segment(bv, bio, i) {
1294 struct bio_vec *bv = bio_iovec(bio);
1295 unsigned j = min(bv->bv_len >> 9, sectors); 1263 unsigned j = min(bv->bv_len >> 9, sectors);
1296 1264
1297 void *p = kmap(bv->bv_page); 1265 void *p = kmap(bv->bv_page);
1298 memset(p + bv->bv_offset, 0, j << 9); 1266 memset(p + bv->bv_offset, 0, j << 9);
1299 kunmap(bv->bv_page); 1267 kunmap(bv->bv_page);
1300 1268
1301 bv->bv_len -= j << 9; 1269 sectors -= j;
1302 bv->bv_offset += j << 9;
1303
1304 if (bv->bv_len)
1305 return 0;
1306
1307 bio->bi_sector += j;
1308 bio->bi_size -= j << 9;
1309
1310 bio->bi_idx++;
1311 sectors -= j;
1312 } 1270 }
1313 1271
1314 s->op.lookup_done = true; 1272 bio_advance(bio, min(sectors << 9, bio->bi_size));
1273
1274 if (!bio->bi_size)
1275 s->op.lookup_done = true;
1315 1276
1316 return 0; 1277 return 0;
1317} 1278}
@@ -1338,8 +1299,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1338 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1299 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1339 } else if (bio_has_data(bio) || s->op.skip) { 1300 } else if (bio_has_data(bio) || s->op.skip) {
1340 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1301 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1341 &KEY(d->id, bio->bi_sector, 0), 1302 &KEY(d->id, bio->bi_sector, 0),
1342 &KEY(d->id, bio_end(bio), 0)); 1303 &KEY(d->id, bio_end_sector(bio), 0));
1343 1304
1344 s->writeback = true; 1305 s->writeback = true;
1345 s->op.cache_bio = bio; 1306 s->op.cache_bio = bio;
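The check_should_skip() rewrite in the request.c hunks above drops the randomized cutoff arithmetic: the task's recent sequential I/O is reduced to a sector count once, and the request bypasses the cache when that count reaches either dc->sequential_cutoff or the congestion threshold returned by bch_get_congested(), each case with its own tracepoint. A compact sketch of that two-threshold decision, with made-up numbers:

    #include <stdbool.h>
    #include <stdio.h>

    /* Decide whether to bypass the cache, in the shape of the reworked
     * check_should_skip(): two independent thresholds, either of which
     * triggers a bypass.  All values are in 512-byte sectors. */
    static bool should_bypass(unsigned sequential_io,
                              unsigned sequential_cutoff,
                              unsigned congested)
    {
        if (sequential_cutoff && sequential_io >= sequential_cutoff)
            return true;    /* long sequential stream: go straight to the backing device */

        if (congested && sequential_io >= congested)
            return true;    /* cache device is congested */

        return false;
    }

    int main(void)
    {
        /* 4 MiB of sequential I/O against an 8 MiB cutoff, no congestion. */
        printf("%d\n", should_bypass(8192, 16384, 0));      /* 0: cache it */
        /* Same stream, but the congestion heuristic allows only 1 MiB. */
        printf("%d\n", should_bypass(8192, 16384, 2048));   /* 1: bypass */
        return 0;
    }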
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 254d9ab5707c..57dc4784f4f4 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -30,7 +30,7 @@ struct search {
30}; 30};
31 31
32void bch_cache_read_endio(struct bio *, int); 32void bch_cache_read_endio(struct bio *, int);
33int bch_get_congested(struct cache_set *); 33unsigned bch_get_congested(struct cache_set *);
34void bch_insert_data(struct closure *cl); 34void bch_insert_data(struct closure *cl);
35void bch_btree_insert_async(struct closure *); 35void bch_btree_insert_async(struct closure *);
36void bch_cache_read_endio(struct bio *, int); 36void bch_cache_read_endio(struct bio *, int);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f88e2b653a3f..547c4c57b052 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,10 +10,13 @@
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "request.h" 12#include "request.h"
13#include "writeback.h"
13 14
15#include <linux/blkdev.h>
14#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
15#include <linux/debugfs.h> 17#include <linux/debugfs.h>
16#include <linux/genhd.h> 18#include <linux/genhd.h>
19#include <linux/kthread.h>
17#include <linux/module.h> 20#include <linux/module.h>
18#include <linux/random.h> 21#include <linux/random.h>
19#include <linux/reboot.h> 22#include <linux/reboot.h>
@@ -342,6 +345,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
342 struct closure *cl = &c->uuid_write.cl; 345 struct closure *cl = &c->uuid_write.cl;
343 struct uuid_entry *u; 346 struct uuid_entry *u;
344 unsigned i; 347 unsigned i;
348 char buf[80];
345 349
346 BUG_ON(!parent); 350 BUG_ON(!parent);
347 closure_lock(&c->uuid_write, parent); 351 closure_lock(&c->uuid_write, parent);
@@ -362,8 +366,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
362 break; 366 break;
363 } 367 }
364 368
365 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", 369 bch_bkey_to_text(buf, sizeof(buf), k);
366 pkey(&c->uuid_bucket)); 370 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
367 371
368 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 372 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
369 if (!bch_is_zero(u->uuid, 16)) 373 if (!bch_is_zero(u->uuid, 16))
@@ -543,7 +547,6 @@ void bch_prio_write(struct cache *ca)
543 547
544 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 548 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
545 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 549 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
546 blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
547 550
548 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 551 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
549 long bucket; 552 long bucket;
@@ -704,7 +707,8 @@ static void bcache_device_detach(struct bcache_device *d)
704 atomic_set(&d->detaching, 0); 707 atomic_set(&d->detaching, 0);
705 } 708 }
706 709
707 bcache_device_unlink(d); 710 if (!d->flush_done)
711 bcache_device_unlink(d);
708 712
709 d->c->devices[d->id] = NULL; 713 d->c->devices[d->id] = NULL;
710 closure_put(&d->c->caching); 714 closure_put(&d->c->caching);
@@ -743,13 +747,35 @@ static void bcache_device_free(struct bcache_device *d)
743 mempool_destroy(d->unaligned_bvec); 747 mempool_destroy(d->unaligned_bvec);
744 if (d->bio_split) 748 if (d->bio_split)
745 bioset_free(d->bio_split); 749 bioset_free(d->bio_split);
750 if (is_vmalloc_addr(d->stripe_sectors_dirty))
751 vfree(d->stripe_sectors_dirty);
752 else
753 kfree(d->stripe_sectors_dirty);
746 754
747 closure_debug_destroy(&d->cl); 755 closure_debug_destroy(&d->cl);
748} 756}
749 757
750static int bcache_device_init(struct bcache_device *d, unsigned block_size) 758static int bcache_device_init(struct bcache_device *d, unsigned block_size,
759 sector_t sectors)
751{ 760{
752 struct request_queue *q; 761 struct request_queue *q;
762 size_t n;
763
764 if (!d->stripe_size_bits)
765 d->stripe_size_bits = 31;
766
767 d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
768 d->stripe_size_bits;
769
770 if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
771 return -ENOMEM;
772
773 n = d->nr_stripes * sizeof(atomic_t);
774 d->stripe_sectors_dirty = n < PAGE_SIZE << 6
775 ? kzalloc(n, GFP_KERNEL)
776 : vzalloc(n);
777 if (!d->stripe_sectors_dirty)
778 return -ENOMEM;
753 779
754 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 780 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
755 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, 781 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
@@ -759,6 +785,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
759 !(q = blk_alloc_queue(GFP_KERNEL))) 785 !(q = blk_alloc_queue(GFP_KERNEL)))
760 return -ENOMEM; 786 return -ENOMEM;
761 787
788 set_capacity(d->disk, sectors);
762 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); 789 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
763 790
764 d->disk->major = bcache_major; 791 d->disk->major = bcache_major;
@@ -781,6 +808,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
781 set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); 808 set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
782 set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); 809 set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
783 810
811 blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
812
784 return 0; 813 return 0;
785} 814}
786 815
@@ -800,6 +829,17 @@ static void calc_cached_dev_sectors(struct cache_set *c)
800void bch_cached_dev_run(struct cached_dev *dc) 829void bch_cached_dev_run(struct cached_dev *dc)
801{ 830{
802 struct bcache_device *d = &dc->disk; 831 struct bcache_device *d = &dc->disk;
832 char buf[SB_LABEL_SIZE + 1];
833 char *env[] = {
834 "DRIVER=bcache",
835 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
836 NULL,
837 NULL,
838 };
839
840 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
841 buf[SB_LABEL_SIZE] = '\0';
842 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
803 843
804 if (atomic_xchg(&dc->running, 1)) 844 if (atomic_xchg(&dc->running, 1))
805 return; 845 return;
@@ -816,10 +856,12 @@ void bch_cached_dev_run(struct cached_dev *dc)
816 856
817 add_disk(d->disk); 857 add_disk(d->disk);
818 bd_link_disk_holder(dc->bdev, dc->disk.disk); 858 bd_link_disk_holder(dc->bdev, dc->disk.disk);
819#if 0 859 /* won't show up in the uevent file, use udevadm monitor -e instead
820 char *env[] = { "SYMLINK=label" , NULL }; 860 * only class / kset properties are persistent */
821 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); 861 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
822#endif 862 kfree(env[1]);
863 kfree(env[2]);
864
823 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || 865 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
824 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) 866 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
825 pr_debug("error creating sysfs link"); 867 pr_debug("error creating sysfs link");
@@ -960,6 +1002,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
960 atomic_set(&dc->count, 1); 1002 atomic_set(&dc->count, 1);
961 1003
962 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { 1004 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1005 bch_sectors_dirty_init(dc);
963 atomic_set(&dc->has_dirty, 1); 1006 atomic_set(&dc->has_dirty, 1);
964 atomic_inc(&dc->count); 1007 atomic_inc(&dc->count);
965 bch_writeback_queue(dc); 1008 bch_writeback_queue(dc);
@@ -1014,6 +1057,14 @@ static void cached_dev_flush(struct closure *cl)
1014 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); 1057 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1015 struct bcache_device *d = &dc->disk; 1058 struct bcache_device *d = &dc->disk;
1016 1059
1060 mutex_lock(&bch_register_lock);
1061 d->flush_done = 1;
1062
1063 if (d->c)
1064 bcache_device_unlink(d);
1065
1066 mutex_unlock(&bch_register_lock);
1067
1017 bch_cache_accounting_destroy(&dc->accounting); 1068 bch_cache_accounting_destroy(&dc->accounting);
1018 kobject_del(&d->kobj); 1069 kobject_del(&d->kobj);
1019 1070
@@ -1045,7 +1096,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1045 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1096 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1046 } 1097 }
1047 1098
1048 ret = bcache_device_init(&dc->disk, block_size); 1099 ret = bcache_device_init(&dc->disk, block_size,
1100 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1049 if (ret) 1101 if (ret)
1050 return ret; 1102 return ret;
1051 1103
@@ -1144,11 +1196,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1144 1196
1145 kobject_init(&d->kobj, &bch_flash_dev_ktype); 1197 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1146 1198
1147 if (bcache_device_init(d, block_bytes(c))) 1199 if (bcache_device_init(d, block_bytes(c), u->sectors))
1148 goto err; 1200 goto err;
1149 1201
1150 bcache_device_attach(d, c, u - c->uuids); 1202 bcache_device_attach(d, c, u - c->uuids);
1151 set_capacity(d->disk, u->sectors);
1152 bch_flash_dev_request_init(d); 1203 bch_flash_dev_request_init(d);
1153 add_disk(d->disk); 1204 add_disk(d->disk);
1154 1205
@@ -1255,9 +1306,10 @@ static void cache_set_free(struct closure *cl)
1255 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1306 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1256 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); 1307 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1257 1308
1258 kfree(c->fill_iter);
1259 if (c->bio_split) 1309 if (c->bio_split)
1260 bioset_free(c->bio_split); 1310 bioset_free(c->bio_split);
1311 if (c->fill_iter)
1312 mempool_destroy(c->fill_iter);
1261 if (c->bio_meta) 1313 if (c->bio_meta)
1262 mempool_destroy(c->bio_meta); 1314 mempool_destroy(c->bio_meta);
1263 if (c->search) 1315 if (c->search)
@@ -1278,11 +1330,9 @@ static void cache_set_free(struct closure *cl)
1278static void cache_set_flush(struct closure *cl) 1330static void cache_set_flush(struct closure *cl)
1279{ 1331{
1280 struct cache_set *c = container_of(cl, struct cache_set, caching); 1332 struct cache_set *c = container_of(cl, struct cache_set, caching);
1333 struct cache *ca;
1281 struct btree *b; 1334 struct btree *b;
1282 1335 unsigned i;
1283 /* Shut down allocator threads */
1284 set_bit(CACHE_SET_STOPPING_2, &c->flags);
1285 wake_up(&c->alloc_wait);
1286 1336
1287 bch_cache_accounting_destroy(&c->accounting); 1337 bch_cache_accounting_destroy(&c->accounting);
1288 1338
@@ -1295,7 +1345,11 @@ static void cache_set_flush(struct closure *cl)
1295 /* Should skip this if we're unregistering because of an error */ 1345 /* Should skip this if we're unregistering because of an error */
1296 list_for_each_entry(b, &c->btree_cache, list) 1346 list_for_each_entry(b, &c->btree_cache, list)
1297 if (btree_node_dirty(b)) 1347 if (btree_node_dirty(b))
1298 bch_btree_write(b, true, NULL); 1348 bch_btree_node_write(b, NULL);
1349
1350 for_each_cache(ca, c, i)
1351 if (ca->alloc_thread)
1352 kthread_stop(ca->alloc_thread);
1299 1353
1300 closure_return(cl); 1354 closure_return(cl);
1301} 1355}
@@ -1303,18 +1357,22 @@ static void cache_set_flush(struct closure *cl)
1303static void __cache_set_unregister(struct closure *cl) 1357static void __cache_set_unregister(struct closure *cl)
1304{ 1358{
1305 struct cache_set *c = container_of(cl, struct cache_set, caching); 1359 struct cache_set *c = container_of(cl, struct cache_set, caching);
1306 struct cached_dev *dc, *t; 1360 struct cached_dev *dc;
1307 size_t i; 1361 size_t i;
1308 1362
1309 mutex_lock(&bch_register_lock); 1363 mutex_lock(&bch_register_lock);
1310 1364
1311 if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
1312 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1313 bch_cached_dev_detach(dc);
1314
1315 for (i = 0; i < c->nr_uuids; i++) 1365 for (i = 0; i < c->nr_uuids; i++)
1316 if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) 1366 if (c->devices[i]) {
1317 bcache_device_stop(c->devices[i]); 1367 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1368 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1369 dc = container_of(c->devices[i],
1370 struct cached_dev, disk);
1371 bch_cached_dev_detach(dc);
1372 } else {
1373 bcache_device_stop(c->devices[i]);
1374 }
1375 }
1318 1376
1319 mutex_unlock(&bch_register_lock); 1377 mutex_unlock(&bch_register_lock);
1320 1378
@@ -1373,9 +1431,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1373 c->btree_pages = max_t(int, c->btree_pages / 4, 1431 c->btree_pages = max_t(int, c->btree_pages / 4,
1374 BTREE_MAX_PAGES); 1432 BTREE_MAX_PAGES);
1375 1433
1376 init_waitqueue_head(&c->alloc_wait); 1434 c->sort_crit_factor = int_sqrt(c->btree_pages);
1435
1377 mutex_init(&c->bucket_lock); 1436 mutex_init(&c->bucket_lock);
1378 mutex_init(&c->fill_lock);
1379 mutex_init(&c->sort_lock); 1437 mutex_init(&c->sort_lock);
1380 spin_lock_init(&c->sort_time_lock); 1438 spin_lock_init(&c->sort_time_lock);
1381 closure_init_unlocked(&c->sb_write); 1439 closure_init_unlocked(&c->sb_write);
@@ -1401,8 +1459,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1401 !(c->bio_meta = mempool_create_kmalloc_pool(2, 1459 !(c->bio_meta = mempool_create_kmalloc_pool(2,
1402 sizeof(struct bbio) + sizeof(struct bio_vec) * 1460 sizeof(struct bbio) + sizeof(struct bio_vec) *
1403 bucket_pages(c))) || 1461 bucket_pages(c))) ||
1462 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1404 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1463 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1405 !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
1406 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || 1464 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1407 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1465 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1408 bch_journal_alloc(c) || 1466 bch_journal_alloc(c) ||
@@ -1410,8 +1468,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1410 bch_open_buckets_alloc(c)) 1468 bch_open_buckets_alloc(c))
1411 goto err; 1469 goto err;
1412 1470
1413 c->fill_iter->size = sb->bucket_size / sb->block_size;
1414
1415 c->congested_read_threshold_us = 2000; 1471 c->congested_read_threshold_us = 2000;
1416 c->congested_write_threshold_us = 20000; 1472 c->congested_write_threshold_us = 20000;
1417 c->error_limit = 8 << IO_ERROR_SHIFT; 1473 c->error_limit = 8 << IO_ERROR_SHIFT;
@@ -1496,9 +1552,10 @@ static void run_cache_set(struct cache_set *c)
1496 */ 1552 */
1497 bch_journal_next(&c->journal); 1553 bch_journal_next(&c->journal);
1498 1554
1555 err = "error starting allocator thread";
1499 for_each_cache(ca, c, i) 1556 for_each_cache(ca, c, i)
1500 closure_call(&ca->alloc, bch_allocator_thread, 1557 if (bch_cache_allocator_start(ca))
1501 system_wq, &c->cl); 1558 goto err;
1502 1559
1503 /* 1560 /*
1504 * First place it's safe to allocate: btree_check() and 1561 * First place it's safe to allocate: btree_check() and
@@ -1531,17 +1588,16 @@ static void run_cache_set(struct cache_set *c)
1531 1588
1532 bch_btree_gc_finish(c); 1589 bch_btree_gc_finish(c);
1533 1590
1591 err = "error starting allocator thread";
1534 for_each_cache(ca, c, i) 1592 for_each_cache(ca, c, i)
1535 closure_call(&ca->alloc, bch_allocator_thread, 1593 if (bch_cache_allocator_start(ca))
1536 ca->alloc_workqueue, &c->cl); 1594 goto err;
1537 1595
1538 mutex_lock(&c->bucket_lock); 1596 mutex_lock(&c->bucket_lock);
1539 for_each_cache(ca, c, i) 1597 for_each_cache(ca, c, i)
1540 bch_prio_write(ca); 1598 bch_prio_write(ca);
1541 mutex_unlock(&c->bucket_lock); 1599 mutex_unlock(&c->bucket_lock);
1542 1600
1543 wake_up(&c->alloc_wait);
1544
1545 err = "cannot allocate new UUID bucket"; 1601 err = "cannot allocate new UUID bucket";
1546 if (__uuid_write(c)) 1602 if (__uuid_write(c))
1547 goto err_unlock_gc; 1603 goto err_unlock_gc;
@@ -1552,7 +1608,7 @@ static void run_cache_set(struct cache_set *c)
1552 goto err_unlock_gc; 1608 goto err_unlock_gc;
1553 1609
1554 bkey_copy_key(&c->root->key, &MAX_KEY); 1610 bkey_copy_key(&c->root->key, &MAX_KEY);
1555 bch_btree_write(c->root, true, &op); 1611 bch_btree_node_write(c->root, &op.cl);
1556 1612
1557 bch_btree_set_root(c->root); 1613 bch_btree_set_root(c->root);
1558 rw_unlock(true, c->root); 1614 rw_unlock(true, c->root);
@@ -1673,9 +1729,6 @@ void bch_cache_release(struct kobject *kobj)
1673 1729
1674 bio_split_pool_free(&ca->bio_split_hook); 1730 bio_split_pool_free(&ca->bio_split_hook);
1675 1731
1676 if (ca->alloc_workqueue)
1677 destroy_workqueue(ca->alloc_workqueue);
1678
1679 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); 1732 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1680 kfree(ca->prio_buckets); 1733 kfree(ca->prio_buckets);
1681 vfree(ca->buckets); 1734 vfree(ca->buckets);
@@ -1723,7 +1776,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1723 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 1776 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1724 2, GFP_KERNEL)) || 1777 2, GFP_KERNEL)) ||
1725 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || 1778 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1726 !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
1727 bio_split_pool_init(&ca->bio_split_hook)) 1779 bio_split_pool_init(&ca->bio_split_hook))
1728 return -ENOMEM; 1780 return -ENOMEM;
1729 1781
@@ -1786,6 +1838,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1786kobj_attribute_write(register, register_bcache); 1838kobj_attribute_write(register, register_bcache);
1787kobj_attribute_write(register_quiet, register_bcache); 1839kobj_attribute_write(register_quiet, register_bcache);
1788 1840
1841static bool bch_is_open_backing(struct block_device *bdev) {
1842 struct cache_set *c, *tc;
1843 struct cached_dev *dc, *t;
1844
1845 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1846 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1847 if (dc->bdev == bdev)
1848 return true;
1849 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1850 if (dc->bdev == bdev)
1851 return true;
1852 return false;
1853}
1854
1855static bool bch_is_open_cache(struct block_device *bdev) {
1856 struct cache_set *c, *tc;
1857 struct cache *ca;
1858 unsigned i;
1859
1860 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1861 for_each_cache(ca, c, i)
1862 if (ca->bdev == bdev)
1863 return true;
1864 return false;
1865}
1866
1867static bool bch_is_open(struct block_device *bdev) {
1868 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
1869}
1870
1789static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, 1871static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1790 const char *buffer, size_t size) 1872 const char *buffer, size_t size)
1791{ 1873{
@@ -1810,8 +1892,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1810 FMODE_READ|FMODE_WRITE|FMODE_EXCL, 1892 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1811 sb); 1893 sb);
1812 if (IS_ERR(bdev)) { 1894 if (IS_ERR(bdev)) {
1813 if (bdev == ERR_PTR(-EBUSY)) 1895 if (bdev == ERR_PTR(-EBUSY)) {
1814 err = "device busy"; 1896 bdev = lookup_bdev(strim(path));
1897 if (!IS_ERR(bdev) && bch_is_open(bdev))
1898 err = "device already registered";
1899 else
1900 err = "device busy";
1901 }
1815 goto err; 1902 goto err;
1816 } 1903 }
1817 1904
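The super.c hunks above replace the closure/workqueue based allocator with a dedicated per-cache kernel thread: run_cache_set() starts it through bch_cache_allocator_start() and cache_set_flush() stops it with kthread_stop(), which is why CACHE_SET_STOPPING_2, alloc_wait and the bch_allocator workqueue all disappear. bch_cache_allocator_start() itself lives in alloc.c and is not shown in this diff; the sketch below is an assumption of what such a starter looks like, inferred from the kthread_stop() and wake_up_process(ca->alloc_thread) calls that are visible here, not a copy of the in-tree function.

/* Kernel-side sketch, assuming bch_allocator_thread() now has the usual
 * int (*)(void *data) kthread signature.  Needs <linux/kthread.h> and
 * <linux/err.h>. */
static int bch_cache_allocator_start(struct cache *ca)
{
	struct task_struct *k = kthread_run(bch_allocator_thread, ca,
					    "bcache_allocator");
	if (IS_ERR(k))
		return PTR_ERR(k);

	ca->alloc_thread = k;
	return 0;
}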
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4d9cca47e4c6..12a2c2846f99 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -9,7 +9,9 @@
9#include "sysfs.h" 9#include "sysfs.h"
10#include "btree.h" 10#include "btree.h"
11#include "request.h" 11#include "request.h"
12#include "writeback.h"
12 13
14#include <linux/blkdev.h>
13#include <linux/sort.h> 15#include <linux/sort.h>
14 16
15static const char * const cache_replacement_policies[] = { 17static const char * const cache_replacement_policies[] = {
@@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
79rw_attribute(writeback_rate_d_smooth); 81rw_attribute(writeback_rate_d_smooth);
80read_attribute(writeback_rate_debug); 82read_attribute(writeback_rate_debug);
81 83
84read_attribute(stripe_size);
85read_attribute(partial_stripes_expensive);
86
82rw_attribute(synchronous); 87rw_attribute(synchronous);
83rw_attribute(journal_delay_ms); 88rw_attribute(journal_delay_ms);
84rw_attribute(discard); 89rw_attribute(discard);
@@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
127 char derivative[20]; 132 char derivative[20];
128 char target[20]; 133 char target[20];
129 bch_hprint(dirty, 134 bch_hprint(dirty,
130 atomic_long_read(&dc->disk.sectors_dirty) << 9); 135 bcache_dev_sectors_dirty(&dc->disk) << 9);
131 bch_hprint(derivative, dc->writeback_rate_derivative << 9); 136 bch_hprint(derivative, dc->writeback_rate_derivative << 9);
132 bch_hprint(target, dc->writeback_rate_target << 9); 137 bch_hprint(target, dc->writeback_rate_target << 9);
133 138
@@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
143 } 148 }
144 149
145 sysfs_hprint(dirty_data, 150 sysfs_hprint(dirty_data,
146 atomic_long_read(&dc->disk.sectors_dirty) << 9); 151 bcache_dev_sectors_dirty(&dc->disk) << 9);
152
153 sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9);
154 var_printf(partial_stripes_expensive, "%u");
147 155
148 var_printf(sequential_merge, "%i"); 156 var_printf(sequential_merge, "%i");
149 var_hprint(sequential_cutoff); 157 var_hprint(sequential_cutoff);
@@ -170,6 +178,7 @@ STORE(__cached_dev)
170 disk.kobj); 178 disk.kobj);
171 unsigned v = size; 179 unsigned v = size;
172 struct cache_set *c; 180 struct cache_set *c;
181 struct kobj_uevent_env *env;
173 182
174#define d_strtoul(var) sysfs_strtoul(var, dc->var) 183#define d_strtoul(var) sysfs_strtoul(var, dc->var)
175#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) 184#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
@@ -214,6 +223,7 @@ STORE(__cached_dev)
214 } 223 }
215 224
216 if (attr == &sysfs_label) { 225 if (attr == &sysfs_label) {
226 /* note: endlines are preserved */
217 memcpy(dc->sb.label, buf, SB_LABEL_SIZE); 227 memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
218 bch_write_bdev_super(dc, NULL); 228 bch_write_bdev_super(dc, NULL);
219 if (dc->disk.c) { 229 if (dc->disk.c) {
@@ -221,6 +231,15 @@ STORE(__cached_dev)
221 buf, SB_LABEL_SIZE); 231 buf, SB_LABEL_SIZE);
222 bch_uuid_write(dc->disk.c); 232 bch_uuid_write(dc->disk.c);
223 } 233 }
234 env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
235 if (!env)
236 return -ENOMEM;
237 add_uevent_var(env, "DRIVER=bcache");
238 add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
239 add_uevent_var(env, "CACHED_LABEL=%s", buf);
240 kobject_uevent_env(
241 &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
242 kfree(env);
224 } 243 }
225 244
226 if (attr == &sysfs_attach) { 245 if (attr == &sysfs_attach) {
@@ -284,6 +303,8 @@ static struct attribute *bch_cached_dev_files[] = {
284 &sysfs_writeback_rate_d_smooth, 303 &sysfs_writeback_rate_d_smooth,
285 &sysfs_writeback_rate_debug, 304 &sysfs_writeback_rate_debug,
286 &sysfs_dirty_data, 305 &sysfs_dirty_data,
306 &sysfs_stripe_size,
307 &sysfs_partial_stripes_expensive,
287 &sysfs_sequential_cutoff, 308 &sysfs_sequential_cutoff,
288 &sysfs_sequential_merge, 309 &sysfs_sequential_merge,
289 &sysfs_clear_stats, 310 &sysfs_clear_stats,
@@ -665,12 +686,10 @@ SHOW(__bch_cache)
665 int cmp(const void *l, const void *r) 686 int cmp(const void *l, const void *r)
666 { return *((uint16_t *) r) - *((uint16_t *) l); } 687 { return *((uint16_t *) r) - *((uint16_t *) l); }
667 688
668 /* Number of quantiles we compute */
669 const unsigned nq = 31;
670
671 size_t n = ca->sb.nbuckets, i, unused, btree; 689 size_t n = ca->sb.nbuckets, i, unused, btree;
672 uint64_t sum = 0; 690 uint64_t sum = 0;
673 uint16_t q[nq], *p, *cached; 691 /* Compute 31 quantiles */
692 uint16_t q[31], *p, *cached;
674 ssize_t ret; 693 ssize_t ret;
675 694
676 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); 695 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
@@ -703,26 +722,29 @@ SHOW(__bch_cache)
703 if (n) 722 if (n)
704 do_div(sum, n); 723 do_div(sum, n);
705 724
706 for (i = 0; i < nq; i++) 725 for (i = 0; i < ARRAY_SIZE(q); i++)
707 q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; 726 q[i] = INITIAL_PRIO - cached[n * (i + 1) /
727 (ARRAY_SIZE(q) + 1)];
708 728
709 vfree(p); 729 vfree(p);
710 730
711 ret = snprintf(buf, PAGE_SIZE, 731 ret = scnprintf(buf, PAGE_SIZE,
712 "Unused: %zu%%\n" 732 "Unused: %zu%%\n"
713 "Metadata: %zu%%\n" 733 "Metadata: %zu%%\n"
714 "Average: %llu\n" 734 "Average: %llu\n"
715 "Sectors per Q: %zu\n" 735 "Sectors per Q: %zu\n"
716 "Quantiles: [", 736 "Quantiles: [",
717 unused * 100 / (size_t) ca->sb.nbuckets, 737 unused * 100 / (size_t) ca->sb.nbuckets,
718 btree * 100 / (size_t) ca->sb.nbuckets, sum, 738 btree * 100 / (size_t) ca->sb.nbuckets, sum,
719 n * ca->sb.bucket_size / (nq + 1)); 739 n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
720 740
721 for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) 741 for (i = 0; i < ARRAY_SIZE(q); i++)
722 ret += snprintf(buf + ret, PAGE_SIZE - ret, 742 ret += scnprintf(buf + ret, PAGE_SIZE - ret,
723 i < nq - 1 ? "%u " : "%u]\n", q[i]); 743 "%u ", q[i]);
724 744 ret--;
725 buf[PAGE_SIZE - 1] = '\0'; 745
746 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
747
726 return ret; 748 return ret;
727 } 749 }
728 750
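The sysfs hunk above sizes the priority quantile array with ARRAY_SIZE() instead of a separate nq constant and switches the output to scnprintf(), which returns the number of bytes actually written and so cannot run past PAGE_SIZE. The index arithmetic is easy to check in isolation; the userspace snippet below reproduces it on a small descending-sorted priority array. The array contents, the three-quantile size and INITIAL_PRIO = 32768 are illustrative values for this snippet only, not data from a running cache.

/* Userspace illustration of the quantile picking: given an array of
 * bucket priorities sorted in descending order, sample ARRAY_SIZE(q)
 * evenly spaced entries and report their distance from INITIAL_PRIO. */
#include <stdio.h>

#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))
#define INITIAL_PRIO	32768U

int main(void)
{
	unsigned short cached[] = { 32760, 32740, 32700, 32600, 32000,
				    31000, 30000, 20000, 10000, 100 };
	unsigned short q[3];
	size_t n = ARRAY_SIZE(cached), i;

	for (i = 0; i < ARRAY_SIZE(q); i++)
		q[i] = INITIAL_PRIO - cached[n * (i + 1) / (ARRAY_SIZE(q) + 1)];

	for (i = 0; i < ARRAY_SIZE(q); i++)
		printf("%u ", (unsigned) q[i]);
	printf("\n");	/* prints: 68 1768 12768 */
	return 0;
}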
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index 983f9bb411bc..f7b6c197f90f 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -2,6 +2,7 @@
2#include "btree.h" 2#include "btree.h"
3#include "request.h" 3#include "request.h"
4 4
5#include <linux/blktrace_api.h>
5#include <linux/module.h> 6#include <linux/module.h>
6 7
7#define CREATE_TRACE_POINTS 8#define CREATE_TRACE_POINTS
@@ -9,18 +10,44 @@
9 10
10EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); 11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); 12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); 13
13EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); 14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); 15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); 19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
16EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); 20
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); 21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); 22
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
27
28EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
29
19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); 30EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
20EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); 31EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); 32
22EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); 33EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); 34EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); 35EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
36
37EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); 38EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); 39EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
40EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
41EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
42
43EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
44
45EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
46EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
47EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
48
49EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
50EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
51
52EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
53EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..98eb81159a22 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
228 } 228 }
229} 229}
230 230
231int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
232{
233 int i;
234 struct bio_vec *bv;
235
236 bio_for_each_segment(bv, bio, i) {
237 bv->bv_page = alloc_page(gfp);
238 if (!bv->bv_page) {
239 while (bv-- != bio->bi_io_vec + bio->bi_idx)
240 __free_page(bv->bv_page);
241 return -ENOMEM;
242 }
243 }
244
245 return 0;
246}
247
248/* 231/*
249 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 232 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
250 * use permitted, subject to terms of PostgreSQL license; see.) 233 * use permitted, subject to terms of PostgreSQL license; see.)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..1ae2a73ad85f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,8 +15,6 @@
15 15
16struct closure; 16struct closure;
17 17
18#include <trace/events/bcache.h>
19
20#ifdef CONFIG_BCACHE_EDEBUG 18#ifdef CONFIG_BCACHE_EDEBUG
21 19
22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
@@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
566 return x; 564 return x;
567} 565}
568 566
569#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
570
571void bch_bio_map(struct bio *bio, void *base); 567void bch_bio_map(struct bio *bio, void *base);
572 568
573int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
574
575static inline sector_t bdev_sectors(struct block_device *bdev) 569static inline sector_t bdev_sectors(struct block_device *bdev)
576{ 570{
577 return bdev->bd_inode->i_size >> 9; 571 return bdev->bd_inode->i_size >> 9;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..22cbff551628 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -9,6 +9,9 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "writeback.h"
13
14#include <trace/events/bcache.h>
12 15
13static struct workqueue_struct *dirty_wq; 16static struct workqueue_struct *dirty_wq;
14 17
@@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
36 39
37 int change = 0; 40 int change = 0;
38 int64_t error; 41 int64_t error;
39 int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); 42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
40 int64_t derivative = dirty - dc->disk.sectors_dirty_last; 43 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
41 44
42 dc->disk.sectors_dirty_last = dirty; 45 dc->disk.sectors_dirty_last = dirty;
@@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
105 return KEY_DIRTY(k); 108 return KEY_DIRTY(k);
106} 109}
107 110
111static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
112{
113 uint64_t stripe;
114 unsigned nr_sectors = KEY_SIZE(k);
115 struct cached_dev *dc = container_of(buf, struct cached_dev,
116 writeback_keys);
117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
118
119 if (!KEY_DIRTY(k))
120 return false;
121
122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
123 while (1) {
124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
125 stripe_size)
126 return false;
127
128 if (nr_sectors <= stripe_size)
129 return true;
130
131 nr_sectors -= stripe_size;
132 stripe++;
133 }
134}
135
108static void dirty_init(struct keybuf_key *w) 136static void dirty_init(struct keybuf_key *w)
109{ 137{
110 struct dirty_io *io = w->private; 138 struct dirty_io *io = w->private;
@@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
149 searched_from_start = true; 177 searched_from_start = true;
150 } 178 }
151 179
152 bch_refill_keybuf(dc->disk.c, buf, &end); 180 if (dc->partial_stripes_expensive) {
181 uint64_t i;
182
183 for (i = 0; i < dc->disk.nr_stripes; i++)
184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
185 1 << dc->disk.stripe_size_bits)
186 goto full_stripes;
187
188 goto normal_refill;
189full_stripes:
190 bch_refill_keybuf(dc->disk.c, buf, &end,
191 dirty_full_stripe_pred);
192 } else {
193normal_refill:
194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
195 }
153 196
154 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { 197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
155 /* Searched the entire btree - delay awhile */ 198 /* Searched the entire btree - delay awhile */
@@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
181 } 224 }
182} 225}
183 226
184void bch_writeback_add(struct cached_dev *dc, unsigned sectors) 227void bch_writeback_add(struct cached_dev *dc)
185{ 228{
186 atomic_long_add(sectors, &dc->disk.sectors_dirty);
187
188 if (!atomic_read(&dc->has_dirty) && 229 if (!atomic_read(&dc->has_dirty) &&
189 !atomic_xchg(&dc->has_dirty, 1)) { 230 !atomic_xchg(&dc->has_dirty, 1)) {
190 atomic_inc(&dc->count); 231 atomic_inc(&dc->count);
@@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
203 } 244 }
204} 245}
205 246
247void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
248 uint64_t offset, int nr_sectors)
249{
250 struct bcache_device *d = c->devices[inode];
251 unsigned stripe_size, stripe_offset;
252 uint64_t stripe;
253
254 if (!d)
255 return;
256
257 stripe_size = 1 << d->stripe_size_bits;
258 stripe = offset >> d->stripe_size_bits;
259 stripe_offset = offset & (stripe_size - 1);
260
261 while (nr_sectors) {
262 int s = min_t(unsigned, abs(nr_sectors),
263 stripe_size - stripe_offset);
264
265 if (nr_sectors < 0)
266 s = -s;
267
268 atomic_add(s, d->stripe_sectors_dirty + stripe);
269 nr_sectors -= s;
270 stripe_offset = 0;
271 stripe++;
272 }
273}
274
206/* Background writeback - IO loop */ 275/* Background writeback - IO loop */
207 276
208static void dirty_io_destructor(struct closure *cl) 277static void dirty_io_destructor(struct closure *cl)
@@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
216 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 285 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
217 struct keybuf_key *w = io->bio.bi_private; 286 struct keybuf_key *w = io->bio.bi_private;
218 struct cached_dev *dc = io->dc; 287 struct cached_dev *dc = io->dc;
219 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); 288 struct bio_vec *bv;
289 int i;
220 290
221 while (bv-- != io->bio.bi_io_vec) 291 bio_for_each_segment_all(bv, &io->bio, i)
222 __free_page(bv->bv_page); 292 __free_page(bv->bv_page);
223 293
224 /* This is kind of a dumb way of signalling errors. */ 294 /* This is kind of a dumb way of signalling errors. */
@@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
236 for (i = 0; i < KEY_PTRS(&w->key); i++) 306 for (i = 0; i < KEY_PTRS(&w->key); i++)
237 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 307 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
238 308
239 pr_debug("clearing %s", pkey(&w->key));
240 bch_btree_insert(&op, dc->disk.c); 309 bch_btree_insert(&op, dc->disk.c);
241 closure_sync(&op.cl); 310 closure_sync(&op.cl);
242 311
312 if (op.insert_collision)
313 trace_bcache_writeback_collision(&w->key);
314
243 atomic_long_inc(op.insert_collision 315 atomic_long_inc(op.insert_collision
244 ? &dc->disk.c->writeback_keys_failed 316 ? &dc->disk.c->writeback_keys_failed
245 : &dc->disk.c->writeback_keys_done); 317 : &dc->disk.c->writeback_keys_done);
@@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
275 io->bio.bi_bdev = io->dc->bdev; 347 io->bio.bi_bdev = io->dc->bdev;
276 io->bio.bi_end_io = dirty_endio; 348 io->bio.bi_end_io = dirty_endio;
277 349
278 trace_bcache_write_dirty(&io->bio);
279 closure_bio_submit(&io->bio, cl, &io->dc->disk); 350 closure_bio_submit(&io->bio, cl, &io->dc->disk);
280 351
281 continue_at(cl, write_dirty_finish, dirty_wq); 352 continue_at(cl, write_dirty_finish, dirty_wq);
@@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
296{ 367{
297 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 368 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
298 369
299 trace_bcache_read_dirty(&io->bio);
300 closure_bio_submit(&io->bio, cl, &io->dc->disk); 370 closure_bio_submit(&io->bio, cl, &io->dc->disk);
301 371
302 continue_at(cl, write_dirty, dirty_wq); 372 continue_at(cl, write_dirty, dirty_wq);
@@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
349 io->bio.bi_rw = READ; 419 io->bio.bi_rw = READ;
350 io->bio.bi_end_io = read_dirty_endio; 420 io->bio.bi_end_io = read_dirty_endio;
351 421
352 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) 422 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
353 goto err_free; 423 goto err_free;
354 424
355 pr_debug("%s", pkey(&w->key)); 425 trace_bcache_writeback(&w->key);
356 426
357 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
358 428
@@ -375,12 +445,49 @@ err:
375 refill_dirty(cl); 445 refill_dirty(cl);
376} 446}
377 447
448/* Init */
449
450static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
451 struct cached_dev *dc)
452{
453 struct bkey *k;
454 struct btree_iter iter;
455
456 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
457 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
458 if (!b->level) {
459 if (KEY_INODE(k) > dc->disk.id)
460 break;
461
462 if (KEY_DIRTY(k))
463 bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
464 KEY_START(k),
465 KEY_SIZE(k));
466 } else {
467 btree(sectors_dirty_init, k, b, op, dc);
468 if (KEY_INODE(k) > dc->disk.id)
469 break;
470
471 cond_resched();
472 }
473
474 return 0;
475}
476
477void bch_sectors_dirty_init(struct cached_dev *dc)
478{
479 struct btree_op op;
480
481 bch_btree_op_init_stack(&op);
482 btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
483}
484
378void bch_cached_dev_writeback_init(struct cached_dev *dc) 485void bch_cached_dev_writeback_init(struct cached_dev *dc)
379{ 486{
380 closure_init_unlocked(&dc->writeback); 487 closure_init_unlocked(&dc->writeback);
381 init_rwsem(&dc->writeback_lock); 488 init_rwsem(&dc->writeback_lock);
382 489
383 bch_keybuf_init(&dc->writeback_keys, dirty_pred); 490 bch_keybuf_init(&dc->writeback_keys);
384 491
385 dc->writeback_metadata = true; 492 dc->writeback_metadata = true;
386 dc->writeback_running = true; 493 dc->writeback_running = true;
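bcache_dev_sectors_dirty_add() above is the core of the new stripe-granular dirty accounting: a dirty range (or a clean one, when nr_sectors is negative) is chopped at stripe boundaries and each stripe's counter is adjusted, so bcache_dev_sectors_dirty() can later sum the per-stripe counters instead of relying on one global atomic. Below is a reduced userspace version of the same splitting loop; the 8-sector stripe size and the plain int array standing in for the atomic counters are illustrative choices, not values from the driver.

/* Userspace sketch of the splitting loop used by the stripe accounting. */
#include <stdio.h>
#include <stdlib.h>

#define STRIPE_SIZE_BITS	3
#define NR_STRIPES		8

static int stripe_sectors_dirty[NR_STRIPES];

static void sectors_dirty_add(unsigned long long offset, int nr_sectors)
{
	unsigned stripe_size = 1U << STRIPE_SIZE_BITS;
	unsigned long long stripe = offset >> STRIPE_SIZE_BITS;
	unsigned stripe_offset = offset & (stripe_size - 1);

	while (nr_sectors) {
		int s = abs(nr_sectors);

		if ((unsigned) s > stripe_size - stripe_offset)
			s = stripe_size - stripe_offset;
		if (nr_sectors < 0)
			s = -s;

		stripe_sectors_dirty[stripe] += s;
		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

int main(void)
{
	unsigned i;

	sectors_dirty_add(5, 10);	/* dirty range crossing a stripe boundary */
	sectors_dirty_add(5, -3);	/* part of it written back again */

	for (i = 0; i < NR_STRIPES; i++)
		printf("stripe %u: %d dirty sectors\n", i,
		       stripe_sectors_dirty[i]);
	return 0;
}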
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
new file mode 100644
index 000000000000..c91f61bb95b6
--- /dev/null
+++ b/drivers/md/bcache/writeback.h
@@ -0,0 +1,64 @@
1#ifndef _BCACHE_WRITEBACK_H
2#define _BCACHE_WRITEBACK_H
3
4#define CUTOFF_WRITEBACK 40
5#define CUTOFF_WRITEBACK_SYNC 70
6
7static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
8{
9 uint64_t i, ret = 0;
10
11 for (i = 0; i < d->nr_stripes; i++)
12 ret += atomic_read(d->stripe_sectors_dirty + i);
13
14 return ret;
15}
16
17static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
18 uint64_t offset,
19 unsigned nr_sectors)
20{
21 uint64_t stripe = offset >> d->stripe_size_bits;
22
23 while (1) {
24 if (atomic_read(d->stripe_sectors_dirty + stripe))
25 return true;
26
27 if (nr_sectors <= 1 << d->stripe_size_bits)
28 return false;
29
30 nr_sectors -= 1 << d->stripe_size_bits;
31 stripe++;
32 }
33}
34
35static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
36 unsigned cache_mode, bool would_skip)
37{
38 unsigned in_use = dc->disk.c->gc_stats.in_use;
39
40 if (cache_mode != CACHE_MODE_WRITEBACK ||
41 atomic_read(&dc->disk.detaching) ||
42 in_use > CUTOFF_WRITEBACK_SYNC)
43 return false;
44
45 if (dc->partial_stripes_expensive &&
46 bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
47 bio_sectors(bio)))
48 return true;
49
50 if (would_skip)
51 return false;
52
53 return bio->bi_rw & REQ_SYNC ||
54 in_use <= CUTOFF_WRITEBACK;
55}
56
57void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
58void bch_writeback_queue(struct cached_dev *);
59void bch_writeback_add(struct cached_dev *);
60
61void bch_sectors_dirty_init(struct cached_dev *dc);
62void bch_cached_dev_writeback_init(struct cached_dev *);
63
64#endif
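should_writeback() in the new header above gates writeback on three things: the cache mode and detach state, how full the cache set is relative to CUTOFF_WRITEBACK and CUTOFF_WRITEBACK_SYNC, and whether the write lands in a stripe that already holds dirty data (the partial_stripes_expensive case, where completing a stripe outweighs the bypass decision). The userspace sketch below flattens the cache-set and stripe lookups into plain booleans so the decision table is easy to poke at; only the two cutoff constants are taken from the header, every other name is illustrative.

/* Userspace sketch of the writeback decision. */
#include <stdbool.h>
#include <stdio.h>

#define CUTOFF_WRITEBACK	40
#define CUTOFF_WRITEBACK_SYNC	70

static bool should_writeback(unsigned in_use, bool writeback_mode,
			     bool detaching, bool hits_dirty_stripe,
			     bool would_skip, bool sync)
{
	if (!writeback_mode || detaching || in_use > CUTOFF_WRITEBACK_SYNC)
		return false;

	/* partial_stripes_expensive: finish a stripe that is already dirty */
	if (hits_dirty_stripe)
		return true;

	if (would_skip)
		return false;

	return sync || in_use <= CUTOFF_WRITEBACK;
}

int main(void)
{
	/* 50% full cache: a REQ_SYNC write is still written back ... */
	printf("%d\n", should_writeback(50, true, false, false, false, true));
	/* ... but a plain async write falls back to writethrough. */
	printf("%d\n", should_writeback(50, true, false, false, false, false));
	return 0;
}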
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 957a719e8c2f..df7b0a06b0ea 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2290,12 +2290,18 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2290 d = r10_bio->devs[1].devnum; 2290 d = r10_bio->devs[1].devnum;
2291 wbio = r10_bio->devs[1].bio; 2291 wbio = r10_bio->devs[1].bio;
2292 wbio2 = r10_bio->devs[1].repl_bio; 2292 wbio2 = r10_bio->devs[1].repl_bio;
2293 /* Need to test wbio2->bi_end_io before we call
2294 * generic_make_request as if the former is NULL,
2295 * the latter is free to free wbio2.
2296 */
2297 if (wbio2 && !wbio2->bi_end_io)
2298 wbio2 = NULL;
2293 if (wbio->bi_end_io) { 2299 if (wbio->bi_end_io) {
2294 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2300 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2295 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 2301 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2296 generic_make_request(wbio); 2302 generic_make_request(wbio);
2297 } 2303 }
2298 if (wbio2 && wbio2->bi_end_io) { 2304 if (wbio2) {
2299 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2305 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2300 md_sync_acct(conf->mirrors[d].replacement->bdev, 2306 md_sync_acct(conf->mirrors[d].replacement->bdev,
2301 bio_sectors(wbio2)); 2307 bio_sectors(wbio2));
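The raid10 change above is about object lifetime rather than logic: once wbio is handed to generic_make_request(), its completion can put the last reference to the shared r10_bio and free wbio2 with it, so whether wbio2 will be submitted has to be decided (by checking wbio2->bi_end_io) before the first submission. The toy userspace model below shows the same ordering hazard with a refcounted container standing in for the r10_bio; all names and structures here are invented for the illustration.

/* Toy model of the hazard: freeing the shared object tears down both
 * "bios", so the decision about the second one must be captured before
 * the first is submitted. */
#include <stdio.h>
#include <stdlib.h>

struct shared {
	int refs;
	int *bio1;
	int *bio2;
};

static void put_shared(struct shared *s)
{
	if (--s->refs == 0) {
		free(s->bio1);
		free(s->bio2);
		free(s);
	}
}

/* Submitting bio1 may "complete" immediately and drop the last reference. */
static void submit_bio1(struct shared *s)
{
	put_shared(s);
}

int main(void)
{
	struct shared *s = malloc(sizeof(*s));
	int submit_second;

	s->refs = 1;			/* only the first submission holds a ref */
	s->bio1 = malloc(sizeof(int));
	s->bio2 = NULL;			/* nothing to write to the second device */

	/* Decide about bio2 before bio1 goes out; afterwards the shared
	 * structure, and bio2 with it, may already be gone. */
	submit_second = s->bio2 != NULL;

	submit_bio1(s);

	if (submit_second)
		printf("second write submitted\n");
	else
		printf("no second write\n");
	return 0;
}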
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2bf094a587cb..78ea44336e75 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3462,6 +3462,7 @@ static void handle_stripe(struct stripe_head *sh)
3462 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3462 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3463 set_bit(STRIPE_SYNCING, &sh->state); 3463 set_bit(STRIPE_SYNCING, &sh->state);
3464 clear_bit(STRIPE_INSYNC, &sh->state); 3464 clear_bit(STRIPE_INSYNC, &sh->state);
3465 clear_bit(STRIPE_REPLACED, &sh->state);
3465 } 3466 }
3466 spin_unlock(&sh->stripe_lock); 3467 spin_unlock(&sh->stripe_lock);
3467 } 3468 }
@@ -3607,19 +3608,23 @@ static void handle_stripe(struct stripe_head *sh)
3607 handle_parity_checks5(conf, sh, &s, disks); 3608 handle_parity_checks5(conf, sh, &s, disks);
3608 } 3609 }
3609 3610
3610 if (s.replacing && s.locked == 0 3611 if ((s.replacing || s.syncing) && s.locked == 0
3611 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3612 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
3613 && !test_bit(STRIPE_REPLACED, &sh->state)) {
3612 /* Write out to replacement devices where possible */ 3614 /* Write out to replacement devices where possible */
3613 for (i = 0; i < conf->raid_disks; i++) 3615 for (i = 0; i < conf->raid_disks; i++)
3614 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3616 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3615 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3617 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
3616 set_bit(R5_WantReplace, &sh->dev[i].flags); 3618 set_bit(R5_WantReplace, &sh->dev[i].flags);
3617 set_bit(R5_LOCKED, &sh->dev[i].flags); 3619 set_bit(R5_LOCKED, &sh->dev[i].flags);
3618 s.locked++; 3620 s.locked++;
3619 } 3621 }
3620 set_bit(STRIPE_INSYNC, &sh->state); 3622 if (s.replacing)
3623 set_bit(STRIPE_INSYNC, &sh->state);
3624 set_bit(STRIPE_REPLACED, &sh->state);
3621 } 3625 }
3622 if ((s.syncing || s.replacing) && s.locked == 0 && 3626 if ((s.syncing || s.replacing) && s.locked == 0 &&
3627 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3623 test_bit(STRIPE_INSYNC, &sh->state)) { 3628 test_bit(STRIPE_INSYNC, &sh->state)) {
3624 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3629 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3625 clear_bit(STRIPE_SYNCING, &sh->state); 3630 clear_bit(STRIPE_SYNCING, &sh->state);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b0b663b119a8..70c49329ca9a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -306,6 +306,7 @@ enum {
306 STRIPE_SYNC_REQUESTED, 306 STRIPE_SYNC_REQUESTED,
307 STRIPE_SYNCING, 307 STRIPE_SYNCING,
308 STRIPE_INSYNC, 308 STRIPE_INSYNC,
309 STRIPE_REPLACED,
309 STRIPE_PREREAD_ACTIVE, 310 STRIPE_PREREAD_ACTIVE,
310 STRIPE_DELAYED, 311 STRIPE_DELAYED,
311 STRIPE_DEGRADED, 312 STRIPE_DEGRADED,