path: root/drivers/md
author    Jens Axboe <axboe@kernel.dk>  2013-12-31 11:51:02 -0500
committer Jens Axboe <axboe@kernel.dk>  2013-12-31 11:51:02 -0500
commit    b28bc9b38c52f63f43e3fd875af982f2240a2859 (patch)
tree      76cdb7b52b58f5685993cc15ed81d1c903023358 /drivers/md
parent    8d30726912cb39c3a3ebde06214d54861f8fdde2 (diff)
parent    802eee95bde72fd0cd0f3a5b2098375a487d1eda (diff)
Merge tag 'v3.13-rc6' into for-3.14/core
Needed to bring blk-mq uptodate, since changes have been going in
since for-3.14/core was established.

Fixup merge issues related to the immutable biovec changes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
	block/blk-flush.c
	fs/btrfs/check-integrity.c
	fs/btrfs/extent_io.c
	fs/btrfs/scrub.c
	fs/logfs/dev_bdev.c
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c                            2
-rw-r--r--  drivers/md/bcache/bcache.h                          12
-rw-r--r--  drivers/md/bcache/btree.c                           27
-rw-r--r--  drivers/md/bcache/movinggc.c                        21
-rw-r--r--  drivers/md/bcache/super.c                            2
-rw-r--r--  drivers/md/bcache/sysfs.c                           50
-rw-r--r--  drivers/md/bcache/util.c                             8
-rw-r--r--  drivers/md/bcache/util.h                             2
-rw-r--r--  drivers/md/bcache/writeback.c                       53
-rw-r--r--  drivers/md/dm-bufio.c                                5
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                     13
-rw-r--r--  drivers/md/dm-cache-target.c                         2
-rw-r--r--  drivers/md/dm-delay.c                               23
-rw-r--r--  drivers/md/dm-snap.c                                71
-rw-r--r--  drivers/md/dm-stats.c                                1
-rw-r--r--  drivers/md/dm-table.c                                5
-rw-r--r--  drivers/md/dm-thin-metadata.c                        8
-rw-r--r--  drivers/md/dm-thin-metadata.h                        1
-rw-r--r--  drivers/md/dm-thin.c                                66
-rw-r--r--  drivers/md/md.c                                      2
-rw-r--r--  drivers/md/persistent-data/dm-array.c               10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c        6
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h        7
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c    32
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c   8
-rw-r--r--  drivers/md/raid5.c                                  13
26 files changed, 308 insertions(+), 142 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 2b46bf1d7e40..4c9852d92b0a 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -421,9 +421,11 @@ out:
 
 	if (watermark <= WATERMARK_METADATA) {
 		SET_GC_MARK(b, GC_MARK_METADATA);
+		SET_GC_MOVE(b, 0);
 		b->prio = BTREE_PRIO;
 	} else {
 		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+		SET_GC_MOVE(b, 0);
 		b->prio = INITIAL_PRIO;
 	}
 
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 964353c5329d..dbdbca5a9591 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -197,7 +197,7 @@ struct bucket {
 	uint8_t		disk_gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint8_t		gc_gen;
-	uint16_t	gc_mark;
+	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
 };
 
 /*
@@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
 #define GC_MARK_RECLAIMABLE	0
 #define GC_MARK_DIRTY		1
 #define GC_MARK_METADATA	2
-BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
+BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
 
 #include "journal.h"
 #include "stats.h"
@@ -371,14 +372,14 @@ struct cached_dev {
 	unsigned char		writeback_percent;
 	unsigned		writeback_delay;
 
-	int			writeback_rate_change;
-	int64_t			writeback_rate_derivative;
 	uint64_t		writeback_rate_target;
+	int64_t			writeback_rate_proportional;
+	int64_t			writeback_rate_derivative;
+	int64_t			writeback_rate_change;
 
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
-	unsigned		writeback_rate_d_smooth;
 };
 
 enum alloc_watermarks {
@@ -444,7 +445,6 @@ struct cache {
 	 * call prio_write() to keep gens from wrapping.
 	 */
 	uint8_t			need_save_prio;
-	unsigned		gc_move_threshold;
 
 	/*
 	 * If nonzero, we know we aren't going to find any buckets to invalidate
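The bcache.h hunks above repack the bucket's 16-bit gc_mark: a 2-bit GC mark, a used-sector count shrunk from 14 to 13 bits, and the freed top bit becomes the new GC_MOVE flag that movinggc.c tests instead of the removed per-cache gc_move_threshold. For readers unfamiliar with bcache's BITMASK helper, the following is a minimal user-space sketch of how such an accessor-generating macro behaves; the macro body is an illustration, not a verbatim copy of the kernel header:

#include <assert.h>
#include <stdint.h>

/*
 * Approximation of bcache's BITMASK(): generate get/set accessors for
 * `bits` bits of `field`, starting at bit `offset`.
 */
#define BITMASK(name, type, field, offset, bits)			\
static inline uint64_t name(const type *k)				\
{									\
	return (k->field >> (offset)) & ~(~0ULL << (bits));		\
}									\
									\
static inline void SET_##name(type *k, uint64_t v)			\
{									\
	k->field &= ~(~(~0ULL << (bits)) << (offset));			\
	k->field |= (v & ~(~0ULL << (bits))) << (offset);		\
}

struct bucket {
	uint16_t gc_mark;
};

/* Layout after this commit: 2 + 13 + 1 = 16 bits. */
BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2)
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13)
BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1)

int main(void)
{
	struct bucket b = { 0 };

	SET_GC_SECTORS_USED(&b, 8191);	/* 2^13 - 1, the new maximum */
	SET_GC_MOVE(&b, 1);
	assert(GC_MARK(&b) == 0);	/* GC_MARK_RECLAIMABLE */
	assert(GC_SECTORS_USED(&b) == 8191);
	assert(GC_MOVE(&b) == 1);
	return 0;
}

Shrinking GC_SECTORS_USED by one bit is what makes room for GC_MOVE without growing the bucket structure.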
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index b62f37925374..946ecd3b048b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
 
+	/* don't reclaim buckets to which writeback keys point */
+	rcu_read_lock();
+	for (i = 0; i < c->nr_uuids; i++) {
+		struct bcache_device *d = c->devices[i];
+		struct cached_dev *dc;
+		struct keybuf_key *w, *n;
+		unsigned j;
+
+		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+		dc = container_of(d, struct cached_dev, disk);
+
+		spin_lock(&dc->writeback_keys.lock);
+		rbtree_postorder_for_each_entry_safe(w, n,
+				&dc->writeback_keys.keys, node)
+			for (j = 0; j < KEY_PTRS(&w->key); j++)
+				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
+					    GC_MARK_DIRTY);
+		spin_unlock(&dc->writeback_keys.lock);
+	}
+	rcu_read_unlock();
+
 	for_each_cache(ca, c, i) {
 		uint64_t *i;
 
@@ -1817,7 +1839,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
 		if (KEY_START(k) > KEY_START(insert) + sectors_found)
 			goto check_failed;
 
-		if (KEY_PTRS(replace_key) != KEY_PTRS(k))
+		if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
+		    KEY_DIRTY(k) != KEY_DIRTY(replace_key))
 			goto check_failed;
 
 		/* skip past gen */
@@ -2217,7 +2240,7 @@ struct btree_insert_op {
 	struct bkey	*replace_key;
 };
 
-int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
 {
 	struct btree_insert_op *op = container_of(b_op,
 					struct btree_insert_op, op);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 581f95df8265..052bd24d24b4 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 	unsigned i;
 
 	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct cache *ca = PTR_CACHE(c, k, i);
 		struct bucket *g = PTR_BUCKET(c, k, i);
 
-		if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
+		if (GC_MOVE(g))
 			return true;
 	}
 
@@ -65,11 +64,16 @@ static void write_moving_finish(struct closure *cl)
 
 static void read_moving_endio(struct bio *bio, int error)
 {
+	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct moving_io *io = container_of(bio->bi_private,
 					    struct moving_io, cl);
 
 	if (error)
 		io->op.error = error;
+	else if (!KEY_DIRTY(&b->key) &&
+		 ptr_stale(io->op.c, &b->key, 0)) {
+		io->op.error = -EINTR;
+	}
 
 	bch_bbio_endio(io->op.c, bio, error, "reading data to move");
 }
@@ -141,6 +145,11 @@ static void read_moving(struct cache_set *c)
 		if (!w)
 			break;
 
+		if (ptr_stale(c, &w->key, 0)) {
+			bch_keybuf_del(&c->moving_gc_keys, w);
+			continue;
+		}
+
 		io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
 			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
 			     GFP_KERNEL);
@@ -184,7 +193,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
 
 static unsigned bucket_heap_top(struct cache *ca)
 {
-	return GC_SECTORS_USED(heap_peek(&ca->heap));
+	struct bucket *b;
+	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
 }
 
 void bch_moving_gc(struct cache_set *c)
@@ -226,9 +236,8 @@ void bch_moving_gc(struct cache_set *c)
 			sectors_to_move -= GC_SECTORS_USED(b);
 		}
 
-		ca->gc_move_threshold = bucket_heap_top(ca);
-
-		pr_debug("threshold %u", ca->gc_move_threshold);
+		while (heap_pop(&ca->heap, b, bucket_cmp))
+			SET_GC_MOVE(b, 1);
 	}
 
 	mutex_unlock(&c->bucket_lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 60fb6044b953..93d593f957f6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1672,7 +1672,7 @@ err:
 static bool can_attach_cache(struct cache *ca, struct cache_set *c)
 {
 	return ca->sb.block_size	== c->sb.block_size &&
-		ca->sb.bucket_size	== c->sb.block_size &&
+		ca->sb.bucket_size	== c->sb.bucket_size &&
 		ca->sb.nr_in_set	== c->sb.nr_in_set;
 }
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 80d4c2bee18a..a1f85612f0b3 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -83,7 +83,6 @@ rw_attribute(writeback_rate);
 rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_d_term);
 rw_attribute(writeback_rate_p_term_inverse);
-rw_attribute(writeback_rate_d_smooth);
 read_attribute(writeback_rate_debug);
 
 read_attribute(stripe_size);
@@ -129,31 +128,41 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
-	sysfs_print(writeback_rate,	dc->writeback_rate.rate);
+	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
 
 	var_print(writeback_rate_update_seconds);
 	var_print(writeback_rate_d_term);
 	var_print(writeback_rate_p_term_inverse);
-	var_print(writeback_rate_d_smooth);
 
 	if (attr == &sysfs_writeback_rate_debug) {
+		char rate[20];
 		char dirty[20];
-		char derivative[20];
 		char target[20];
-		bch_hprint(dirty,
-			   bcache_dev_sectors_dirty(&dc->disk) << 9);
-		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
+		char proportional[20];
+		char derivative[20];
+		char change[20];
+		s64 next_io;
+
+		bch_hprint(rate,	dc->writeback_rate.rate << 9);
+		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(target,	dc->writeback_rate_target << 9);
+		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
+		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
+		bch_hprint(change,	dc->writeback_rate_change << 9);
+
+		next_io = div64_s64(dc->writeback_rate.next - local_clock(),
+				    NSEC_PER_MSEC);
 
 		return sprintf(buf,
-			       "rate:\t\t%u\n"
-			       "change:\t\t%i\n"
+			       "rate:\t\t%s/sec\n"
 			       "dirty:\t\t%s\n"
+			       "target:\t\t%s\n"
+			       "proportional:\t%s\n"
 			       "derivative:\t%s\n"
-			       "target:\t\t%s\n",
-			       dc->writeback_rate.rate,
-			       dc->writeback_rate_change,
-			       dirty, derivative, target);
+			       "change:\t\t%s/sec\n"
+			       "next io:\t%llims\n",
+			       rate, dirty, target, proportional,
+			       derivative, change, next_io);
 	}
 
 	sysfs_hprint(dirty_data,
@@ -189,6 +198,7 @@ STORE(__cached_dev)
 	struct kobj_uevent_env *env;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
 #define d_strtoi_h(var)		sysfs_hatoi(var, dc->var)
 
 	sysfs_strtoul(data_csum,	dc->disk.data_csum);
@@ -197,16 +207,15 @@ STORE(__cached_dev)
 	d_strtoul(writeback_metadata);
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
-	sysfs_strtoul_clamp(writeback_rate,
-			    dc->writeback_rate.rate, 1, 1000000);
+
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-	d_strtoul(writeback_rate_update_seconds);
+	sysfs_strtoul_clamp(writeback_rate,
+			    dc->writeback_rate.rate, 1, INT_MAX);
+
+	d_strtoul_nonzero(writeback_rate_update_seconds);
 	d_strtoul(writeback_rate_d_term);
-	d_strtoul(writeback_rate_p_term_inverse);
-	sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
-			    dc->writeback_rate_p_term_inverse, 1, INT_MAX);
-	d_strtoul(writeback_rate_d_smooth);
+	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
 	d_strtoi_h(sequential_cutoff);
 	d_strtoi_h(readahead);
@@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_rate_update_seconds,
 	&sysfs_writeback_rate_d_term,
 	&sysfs_writeback_rate_p_term_inverse,
-	&sysfs_writeback_rate_d_smooth,
 	&sysfs_writeback_rate_debug,
 	&sysfs_dirty_data,
 	&sysfs_stripe_size,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index c57621e49dc0..db3ae4c2b223 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done, d->rate);
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+	if (time_before64(now + NSEC_PER_SEC, d->next))
+		d->next = now + NSEC_PER_SEC;
+
+	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+		d->next = now - NSEC_PER_SEC * 2;
 
 	return time_after64(d->next, now)
 		? div_u64(d->next - now, NSEC_PER_SEC / HZ)
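The util.c hunk above changes bch_next_delay() to account in nanoseconds (done * NSEC_PER_SEC / rate, so `rate` is now in units per second) and clamps d->next into a window around the current time, bounding how far the limiter's schedule can drift. A user-space sketch of that logic, assuming the same [now - 2s, now + 1s] window but returning nanoseconds instead of the kernel's jiffies:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL

struct ratelimit {
	uint64_t next;		/* ns timestamp work is next allowed */
	unsigned rate;		/* units per second */
};

/*
 * Sketch of the reworked bch_next_delay(): charge `done` units at
 * `rate` units/sec, then bound the schedule so the limiter never owes
 * more than 1s of delay nor banks more than 2s of unused credit.
 */
static uint64_t next_delay(struct ratelimit *d, uint64_t done, uint64_t now)
{
	d->next += done * NSEC_PER_SEC / d->rate;

	if (d->next > now + NSEC_PER_SEC)		/* at most 1s of debt */
		d->next = now + NSEC_PER_SEC;

	if (now >= 2 * NSEC_PER_SEC && d->next < now - 2 * NSEC_PER_SEC)
		d->next = now - 2 * NSEC_PER_SEC;	/* at most 2s of credit */

	return d->next > now ? d->next - now : 0;	/* ns to sleep */
}

int main(void)
{
	uint64_t now = 10 * NSEC_PER_SEC;
	struct ratelimit d = { .next = now, .rate = 1024 };

	/* 512 units at 1024 units/sec: half a second of delay. */
	printf("sleep %llu ns\n", (unsigned long long)next_delay(&d, 512, now));
	return 0;
}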
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 362c4b3f8b4a..1030c6020e98 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -110,7 +110,7 @@ do { \
 	_r;								\
 })
 
-#define heap_peek(h)	((h)->size ? (h)->data[0] : NULL)
+#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)
 
 #define heap_full(h)	((h)->used == (h)->size)
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 04657e93f4fd..f4300e4c0114 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	/* PD controller */
 
-	int change = 0;
-	int64_t error;
 	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
 	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+	int64_t proportional = dirty - target;
+	int64_t change;
 
 	dc->disk.sectors_dirty_last = dirty;
 
-	derivative *= dc->writeback_rate_d_term;
-	derivative = clamp(derivative, -dirty, dirty);
+	/* Scale to sectors per second */
 
-	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
-			      dc->writeback_rate_d_smooth, 0);
+	proportional *= dc->writeback_rate_update_seconds;
+	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
 
-	/* Avoid divide by zero */
-	if (!target)
-		goto out;
+	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
 
-	error = div64_s64((dirty + derivative - target) << 8, target);
+	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+			      (dc->writeback_rate_d_term /
+			       dc->writeback_rate_update_seconds) ?: 1, 0);
+
+	derivative *= dc->writeback_rate_d_term;
+	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
 
-	change = div_s64((dc->writeback_rate.rate * error) >> 8,
-			 dc->writeback_rate_p_term_inverse);
+	change = proportional + derivative;
 
 	/* Don't increase writeback rate if the device isn't keeping up */
 	if (change > 0 &&
 	    time_after64(local_clock(),
-			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
+			 dc->writeback_rate.next + NSEC_PER_MSEC))
 		change = 0;
 
 	dc->writeback_rate.rate =
-		clamp_t(int64_t, dc->writeback_rate.rate + change,
+		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
 			1, NSEC_PER_MSEC);
-out:
+
+	dc->writeback_rate_proportional = proportional;
 	dc->writeback_rate_derivative = derivative;
 	dc->writeback_rate_change = change;
 	dc->writeback_rate_target = target;
@@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work)
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
-	uint64_t ret;
-
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    !dc->writeback_percent)
 		return 0;
 
-	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
-
-	return min_t(uint64_t, ret, HZ);
+	return bch_next_delay(&dc->writeback_rate, sectors);
 }
 
 struct dirty_io {
@@ -241,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
 		if (KEY_START(&w->key) != dc->last_read ||
 		    jiffies_to_msecs(delay) > 50)
 			while (!kthread_should_stop() && delay)
-				delay = schedule_timeout_interruptible(delay);
+				delay = schedule_timeout_uninterruptible(delay);
 
 		dc->last_read	= KEY_OFFSET(&w->key);
 
@@ -438,7 +436,7 @@ static int bch_writeback_thread(void *arg)
 			while (delay &&
 			       !kthread_should_stop() &&
 			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-				delay = schedule_timeout_interruptible(delay);
+				delay = schedule_timeout_uninterruptible(delay);
 		}
 	}
 
@@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 
 	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
+
+	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
 int bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -490,18 +490,15 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_delay	= 30;
 	dc->writeback_rate.rate = 1024;
 
-	dc->writeback_rate_update_seconds = 30;
-	dc->writeback_rate_d_term	= 16;
-	dc->writeback_rate_p_term_inverse = 64;
-	dc->writeback_rate_d_smooth	= 8;
+	dc->writeback_rate_update_seconds = 5;
+	dc->writeback_rate_d_term	= 30;
+	dc->writeback_rate_p_term_inverse = 6000;
 
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
 		return PTR_ERR(dc->writeback_thread);
 
-	set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
-
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
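The writeback.c rework above replaces the old scaled-error controller with an explicit proportional-plus-derivative computation: the proportional term is the distance of dirty data from the target, the derivative term is its (smoothed) rate of change, and both are divided down by p_term_inverse. A simplified user-space sketch of the arithmetic with the new defaults from bch_cached_dev_writeback_init(); the EWMA smoothing of the derivative and the "device not keeping up" check are omitted for brevity:

#include <stdint.h>
#include <stdio.h>

/* Defaults set by bch_cached_dev_writeback_init() in the hunk above. */
#define UPDATE_SECONDS	5
#define D_TERM		30
#define P_TERM_INVERSE	6000

/*
 * Sketch of __update_writeback_rate(): adjust the rate (sectors/sec) by
 * a proportional term (distance of dirty data from target) plus a
 * derivative term (growth rate of dirty data).
 */
static int64_t update_rate(int64_t rate, int64_t dirty, int64_t dirty_last,
			   int64_t target)
{
	int64_t proportional = dirty - target;
	int64_t derivative = (dirty - dirty_last) / UPDATE_SECONDS;
	int64_t change;

	proportional *= UPDATE_SECONDS;
	proportional /= P_TERM_INVERSE;

	derivative *= D_TERM;
	derivative /= P_TERM_INVERSE;

	change = proportional + derivative;

	rate += change;
	return rate < 1 ? 1 : rate;	/* kernel clamps to [1, NSEC_PER_MSEC] */
}

int main(void)
{
	/* 10 GiB dirty against an 8 GiB target, growing 1 GiB per interval
	 * (all figures in 512-byte sectors: 1 GiB = 2097152 sectors). */
	int64_t rate = update_rate(1024, 10 << 21, 9 << 21, 8 << 21);

	printf("new rate: %lld sectors/sec\n", (long long)rate);	/* 6616 */
	return 0;
}

With a 5-second interval and p_term_inverse of 6000, the proportional term alone would close the gap to the target in roughly 6000/5 = 1200 update intervals; the derivative term reacts faster when dirty data is actively growing.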
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 4113b6044b80..a1b58a65d8ed 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
 {
 	__u64 mem;
 
+	dm_bufio_allocated_kmem_cache = 0;
+	dm_bufio_allocated_get_free_pages = 0;
+	dm_bufio_allocated_vmalloc = 0;
+	dm_bufio_current_allocated = 0;
+
 	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
 
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index bfba97dcde2d..d13a16865d03 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 	int r = 0;
 	bool updated = updated_this_tick(mq, e);
 
-	requeue_and_update_tick(mq, e);
-
 	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir))
+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue_and_update_tick(mq, e);
 		result->op = POLICY_MISS;
-	else if (!can_migrate)
+
+	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
-	else
+
+	else {
+		requeue_and_update_tick(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
+	}
 
 	return r;
 }
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7c8dd1f69ce0..99f91628a33a 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2763,7 +2763,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 {
 	int r;
 
-	r = dm_cache_resize(cache->cmd, cache->cache_size);
+	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
 		return r;
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 84c860191a2e..fc8482a65dd2 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,6 +20,7 @@
 struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
 	atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
 
 static DEFINE_MUTEX(delayed_bios_lock);
 
-static struct workqueue_struct *kdelayd_wq;
 static struct kmem_cache *delayed_cache;
 
 static void handle_delayed_timer(unsigned long data)
 {
 	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ out:
 		goto bad_dev_write;
 	}
 
+	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+	if (!dc->kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
 	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
 
 	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ out:
 	ti->private = dc;
 	return 0;
 
+bad_queue:
+	mempool_destroy(dc->delayed_pool);
 bad_dev_write:
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	flush_workqueue(kdelayd_wq);
+	destroy_workqueue(dc->kdelayd_wq);
 
 	dm_put_device(ti, dc->dev_read);
 
@@ -351,12 +359,6 @@ static int __init dm_delay_init(void)
 {
 	int r = -ENOMEM;
 
-	kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!kdelayd_wq) {
-		DMERR("Couldn't start kdelayd");
-		goto bad_queue;
-	}
-
 	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 	if (!delayed_cache) {
 		DMERR("Couldn't create delayed bio cache.");
@@ -374,8 +376,6 @@ static int __init dm_delay_init(void)
 bad_register:
 	kmem_cache_destroy(delayed_cache);
 bad_memcache:
-	destroy_workqueue(kdelayd_wq);
-bad_queue:
 	return r;
 }
 
381 381
@@ -383,7 +383,6 @@ static void __exit dm_delay_exit(void)
383{ 383{
384 dm_unregister_target(&delay_target); 384 dm_unregister_target(&delay_target);
385 kmem_cache_destroy(delayed_cache); 385 kmem_cache_destroy(delayed_cache);
386 destroy_workqueue(kdelayd_wq);
387} 386}
388 387
389/* Module hooks */ 388/* Module hooks */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 80b5cabbea29..01b6a11813f2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
+	/* Protected by "lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct list_head out_of_order_list;
+
 	mempool_t *pending_pool;
 
 	struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct list_head out_of_order_entry;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	INIT_LIST_HEAD(&s->out_of_order_list);
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
@@ -1444,6 +1467,19 @@ static void commit_callback(void *context, int success)
 	pending_complete(pe, success);
 }
 
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+
+	if (unlikely(pe->copy_error))
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
+}
+
 /*
  * Called when the copy I/O has finished.  kcopyd actually runs
  * this code so don't block.
@@ -1453,13 +1489,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		while (!list_empty(&s->out_of_order_list)) {
+			pe = list_entry(s->out_of_order_list.next,
+					struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			s->exception_complete_sequence++;
+			list_del(&pe->out_of_order_entry);
+			complete_exception(pe);
+		}
+	} else {
+		struct list_head *lh;
+		struct dm_snap_pending_exception *pe2;
+
+		list_for_each_prev(lh, &s->out_of_order_list) {
+			pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe2->exception_sequence < pe->exception_sequence)
+				break;
+		}
+		list_add(&pe->out_of_order_entry, lh);
+	}
 }
 
 /*
@@ -1554,6 +1609,8 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -2193,7 +2250,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 11, 1},
+	.version = {1, 12, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
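The dm-snap changes above exist because kcopyd can finish chunk copies in any order, while snapshot exceptions must be committed in the order they were started. Each pending exception is stamped with exception_start_sequence at creation; copy_callback() processes a completion immediately only when it matches exception_complete_sequence, and otherwise parks it on out_of_order_list, which is kept sorted by sequence number. A compact user-space sketch of that reordering pattern, with the kernel list API replaced by a hand-rolled sorted singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct pending {
	unsigned seq;
	struct pending *next;
};

static unsigned complete_seq;		/* next sequence we may process */
static struct pending *out_of_order;	/* sorted by seq, ascending */

static void process(struct pending *pe)
{
	printf("processing exception %u\n", pe->seq);
	free(pe);
}

/* Analogue of copy_callback(): enforce in-order processing. */
static void copy_callback(struct pending *pe)
{
	if (pe->seq == complete_seq) {
		complete_seq++;
		process(pe);
		/* drain any parked completions that are now in order */
		while (out_of_order && out_of_order->seq == complete_seq) {
			struct pending *next = out_of_order;

			out_of_order = next->next;
			complete_seq++;
			process(next);
		}
	} else {
		/* insert into the sorted wait list */
		struct pending **p = &out_of_order;

		while (*p && (*p)->seq < pe->seq)
			p = &(*p)->next;
		pe->next = *p;
		*p = pe;
	}
}

int main(void)
{
	unsigned order[] = { 2, 0, 3, 1 };	/* completion order */

	for (unsigned i = 0; i < 4; i++) {
		struct pending *pe = malloc(sizeof(*pe));

		pe->seq = order[i];
		pe->next = NULL;
		copy_callback(pe);
	}
	return 0;	/* prints 0, 1, 2, 3 regardless of arrival order */
}

In the kernel this is safe without extra locking because kcopyd invokes the callback from a single thread, which is exactly what the "Protected by kcopyd single-threaded callback" comments in the struct hunk record.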
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 3d404c1371ed..28a90122a5a8 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 int __init dm_statistics_init(void)
 {
+	shared_memory_amount = 0;
 	dm_stat_need_rcu_barrier = 0;
 	return 0;
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 465f08ca62b1..3ba6a3859ce3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 
+	if (!num_targets) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
 	if (alloc_targets(t, num_targets)) {
 		kfree(t);
 		return -ENOMEM;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 60bce435f4fa..8a30ad54bd46 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 	up_write(&pmd->root_lock);
 }
 
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = false;
+	dm_bm_set_read_write(pmd->bm);
+	up_write(&pmd->root_lock);
+}
+
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
 					dm_sm_threshold_fn fn,
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 845ebbe589a9..7bcc0e1d6238 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1abb4a24c338..357eb272dbd9 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -645,7 +645,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
645 */ 645 */
646 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 646 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
647 if (r) { 647 if (r) {
648 DMERR_LIMIT("dm_thin_insert_block() failed"); 648 DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
649 dm_device_name(pool->pool_md), r);
650 set_pool_mode(pool, PM_READ_ONLY);
649 cell_error(pool, m->cell); 651 cell_error(pool, m->cell);
650 goto out; 652 goto out;
651 } 653 }
@@ -887,32 +889,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	}
 }
 
-static int commit(struct pool *pool)
-{
-	int r;
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r)
-		DMERR_LIMIT("%s: commit failed: error = %d",
-			    dm_device_name(pool->pool_md), r);
-
-	return r;
-}
-
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
  */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
 	int r;
 
 	if (get_pool_mode(pool) != PM_WRITE)
 		return -EINVAL;
 
-	r = commit(pool);
-	if (r)
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
 		set_pool_mode(pool, PM_READ_ONLY);
+	}
 
 	return r;
 }
@@ -949,7 +942,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * Try to commit to see if that will free up some
 		 * more space.
 		 */
-		(void) commit_or_fallback(pool);
+		r = commit(pool);
+		if (r)
+			return r;
 
 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 		if (r)
@@ -963,7 +958,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * table reload).
 		 */
 		if (!free_blocks) {
-			DMWARN("%s: no free space available.",
+			DMWARN("%s: no free data space available.",
 			       dm_device_name(pool->pool_md));
 			spin_lock_irqsave(&pool->lock, flags);
 			pool->no_free_space = 1;
@@ -973,8 +968,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 	}
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
-	if (r)
+	if (r) {
+		if (r == -ENOSPC &&
+		    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+		    !free_blocks) {
+			DMWARN("%s: no free metadata space available.",
+			       dm_device_name(pool->pool_md));
+			set_pool_mode(pool, PM_READ_ONLY);
+		}
 		return r;
+	}
 
 	return 0;
 }
@@ -1355,7 +1358,7 @@ static void process_deferred_bios(struct pool *pool)
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	if (commit_or_fallback(pool)) {
+	if (commit(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1403,6 +1406,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 	case PM_FAIL:
 		DMERR("%s: switching pool to failure mode",
 		      dm_device_name(pool->pool_md));
+		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -1427,6 +1431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 		break;
 
 	case PM_WRITE:
+		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
@@ -1643,12 +1648,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	struct pool_c *pt = ti->private;
 
 	/*
-	 * We want to make sure that degraded pools are never upgraded.
+	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
-	if (old_mode > new_mode)
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.  So we never let the
+	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
+	 * may have been due to a lack of metadata or data space, and may
+	 * now work (ie. if the underlying devices have been resized).
+	 */
+	if (old_mode == PM_FAIL)
 		new_mode = old_mode;
 
 	pool->ti = ti;
@@ -2272,7 +2284,7 @@ static int pool_preresume(struct dm_target *ti)
 		return r;
 
 	if (need_commit1 || need_commit2)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return 0;
 }
@@ -2299,7 +2311,7 @@ static void pool_postsuspend(struct dm_target *ti)
 
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2433,7 +2445,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2495,7 +2507,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
 	if (!r)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return r;
 }
@@ -2550,7 +2562,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 
 	/* Commit to ensure statistics aren't out-of-date */
 	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
 	if (r) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b07fed398fd7..16d84e091e2d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7765,7 +7765,7 @@ void md_check_recovery(struct mddev *mddev)
 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return;
 	if ( ! (
-		(mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
+		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index af96e24ec328..1d75b1dc1e2e 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
 	 * The shadow op will often be a noop.  Only insert if it really
 	 * copied data.
 	 */
-	if (dm_block_location(*block) != b)
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
 		r = insert_ablock(info, index, *block, root);
+	}
 
 	return r;
 }
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a7e8bf296388..064a3c271baa 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 	return crc32c(~(u32) 0, data, len) ^ init_xor;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 9a82083a66b6..13cd58e1fe69 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock);
 
- /*
-  * Request data be prefetched into the cache.
-  */
+/*
+ * Request data is prefetched into the cache.
+ */
 void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 
 /*
@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * be returned if you do.
  */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 6058569fe86c..466a60bbd716 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 }
 
 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
-			uint32_t (*mutator)(void *context, uint32_t old),
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
 			void *context, enum allocation_event *ev)
 {
 	int r;
@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 
 	if (old > 2) {
 		r = sm_ll_lookup_big_ref_count(ll, b, &old);
-		if (r < 0)
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
 			return r;
+		}
 	}
 
-	ref_count = mutator(context, old);
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
 
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-static uint32_t set_ref_count(void *context, uint32_t old)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return *((uint32_t *) context);
+	*new = *((uint32_t *) context);
+	return 0;
 }
 
 int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-static uint32_t inc_ref_count(void *context, uint32_t old)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old + 1;
+	*new = old + 1;
+	return 0;
 }
 
 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
 	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
 }
 
-static uint32_t dec_ref_count(void *context, uint32_t old)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old - 1;
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
+		return -EINVAL;
+	}
+
+	*new = old - 1;
+	return 0;
 }
 
 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 1c959684caef..58fc1eef7499 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	int r = sm_metadata_new_block_(sm, b);
-	if (r)
+	if (r) {
 		DMERR("unable to allocate new metadata block");
+		return r;
+	}
 
 	r = sm_metadata_get_nr_free(sm, &count);
-	if (r)
+	if (r) {
 		DMERR("couldn't get free block count");
+		return r;
+	}
 
 	check_threshold(&smm->threshold, count);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bef353c51c04..eea63372e4d3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -678,26 +678,23 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 		} else
 			init_stripe(sh, sector, previous);
 		} else {
+			spin_lock(&conf->device_lock);
 			if (atomic_read(&sh->count)) {
 				BUG_ON(!list_empty(&sh->lru)
 				    && !test_bit(STRIPE_EXPANDING, &sh->state)
 				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
-				    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
+				    );
 			} else {
-				spin_lock(&conf->device_lock);
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
-				if (list_empty(&sh->lru) &&
-				    !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
-				    !test_bit(STRIPE_EXPANDING, &sh->state))
-					BUG();
+				BUG_ON(list_empty(&sh->lru));
 				list_del_init(&sh->lru);
 				if (sh->group) {
 					sh->group->stripes_cnt--;
 					sh->group = NULL;
 				}
-				spin_unlock(&conf->device_lock);
 			}
+			spin_unlock(&conf->device_lock);
 		}
 	} while (sh == NULL);
 
@@ -5473,7 +5470,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 	for (i = 0; i < *group_cnt; i++) {
 		struct r5worker_group *group;
 
-		group = worker_groups[i];
+		group = &(*worker_groups)[i];
 		INIT_LIST_HEAD(&group->handle_list);
 		group->conf = conf;
 		group->workers = workers + i * cnt;