path: root/drivers/md
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 15
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bcache/Makefile | 5
-rw-r--r--  drivers/md/bcache/alloc.c | 89
-rw-r--r--  drivers/md/bcache/bcache.h | 88
-rw-r--r--  drivers/md/bcache/bset.c | 907
-rw-r--r--  drivers/md/bcache/bset.h | 440
-rw-r--r--  drivers/md/bcache/btree.c | 684
-rw-r--r--  drivers/md/bcache/btree.h | 62
-rw-r--r--  drivers/md/bcache/closure.c | 90
-rw-r--r--  drivers/md/bcache/closure.h | 355
-rw-r--r--  drivers/md/bcache/debug.c | 268
-rw-r--r--  drivers/md/bcache/debug.h | 27
-rw-r--r--  drivers/md/bcache/extents.c | 616
-rw-r--r--  drivers/md/bcache/extents.h | 13
-rw-r--r--  drivers/md/bcache/io.c | 196
-rw-r--r--  drivers/md/bcache/journal.c | 87
-rw-r--r--  drivers/md/bcache/journal.h | 1
-rw-r--r--  drivers/md/bcache/movinggc.c | 6
-rw-r--r--  drivers/md/bcache/request.c | 204
-rw-r--r--  drivers/md/bcache/request.h | 21
-rw-r--r--  drivers/md/bcache/super.c | 123
-rw-r--r--  drivers/md/bcache/sysfs.c | 79
-rw-r--r--  drivers/md/bcache/util.c | 4
-rw-r--r--  drivers/md/bcache/util.h | 8
-rw-r--r--  drivers/md/bcache/writeback.c | 6
-rw-r--r--  drivers/md/bcache/writeback.h | 2
-rw-r--r--  drivers/md/bitmap.c | 2
-rw-r--r--  drivers/md/bitmap.h | 2
-rw-r--r--  drivers/md/dm-bio-record.h | 37
-rw-r--r--  drivers/md/dm-bufio.c | 38
-rw-r--r--  drivers/md/dm-bufio.h | 12
-rw-r--r--  drivers/md/dm-builtin.c | 48
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 78
-rw-r--r--  drivers/md/dm-cache-policy.c | 4
-rw-r--r--  drivers/md/dm-cache-policy.h | 6
-rw-r--r--  drivers/md/dm-cache-target.c | 68
-rw-r--r--  drivers/md/dm-crypt.c | 64
-rw-r--r--  drivers/md/dm-delay.c | 42
-rw-r--r--  drivers/md/dm-flakey.c | 7
-rw-r--r--  drivers/md/dm-io.c | 34
-rw-r--r--  drivers/md/dm-linear.c | 3
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 206
-rw-r--r--  drivers/md/dm-mpath.c | 7
-rw-r--r--  drivers/md/dm-raid1.c | 23
-rw-r--r--  drivers/md/dm-region-hash.c | 3
-rw-r--r--  drivers/md/dm-snap-persistent.c | 90
-rw-r--r--  drivers/md/dm-snap.c | 29
-rw-r--r--  drivers/md/dm-stripe.c | 13
-rw-r--r--  drivers/md/dm-switch.c | 4
-rw-r--r--  drivers/md/dm-sysfs.c | 5
-rw-r--r--  drivers/md/dm-table.c | 22
-rw-r--r--  drivers/md/dm-thin-metadata.c | 78
-rw-r--r--  drivers/md/dm-thin-metadata.h | 25
-rw-r--r--  drivers/md/dm-thin.c | 567
-rw-r--r--  drivers/md/dm-verity.c | 62
-rw-r--r--  drivers/md/dm.c | 204
-rw-r--r--  drivers/md/dm.h | 17
-rw-r--r--  drivers/md/faulty.c | 19
-rw-r--r--  drivers/md/linear.c | 96
-rw-r--r--  drivers/md/md.c | 106
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/multipath.c | 13
-rw-r--r--  drivers/md/persistent-data/Kconfig | 10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 33
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 8
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 6
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 147
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.h | 11
-rw-r--r--  drivers/md/raid0.c | 79
-rw-r--r--  drivers/md/raid1.c | 89
-rw-r--r--  drivers/md/raid10.c | 209
-rw-r--r--  drivers/md/raid5.c | 191
74 files changed, 4061 insertions, 3168 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f2ccbc3b9fe4..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -176,8 +176,12 @@ config MD_FAULTY
176 176
177source "drivers/md/bcache/Kconfig" 177source "drivers/md/bcache/Kconfig"
178 178
179config BLK_DEV_DM_BUILTIN
180 boolean
181
179config BLK_DEV_DM 182config BLK_DEV_DM
180 tristate "Device mapper support" 183 tristate "Device mapper support"
184 select BLK_DEV_DM_BUILTIN
181 ---help--- 185 ---help---
182 Device-mapper is a low level volume manager. It works by allowing 186 Device-mapper is a low level volume manager. It works by allowing
183 people to specify mappings for ranges of logical sectors. Various 187 people to specify mappings for ranges of logical sectors. Various
@@ -238,6 +242,7 @@ config DM_CRYPT
238config DM_SNAPSHOT 242config DM_SNAPSHOT
239 tristate "Snapshot target" 243 tristate "Snapshot target"
240 depends on BLK_DEV_DM 244 depends on BLK_DEV_DM
245 select DM_BUFIO
241 ---help--- 246 ---help---
242 Allow volume managers to take writable snapshots of a device. 247 Allow volume managers to take writable snapshots of a device.
243 248
@@ -249,16 +254,6 @@ config DM_THIN_PROVISIONING
249 ---help--- 254 ---help---
250 Provides thin provisioning and snapshots that share a data store. 255 Provides thin provisioning and snapshots that share a data store.
251 256
252config DM_DEBUG_BLOCK_STACK_TRACING
253 boolean "Keep stack trace of thin provisioning block lock holders"
254 depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
255 select STACKTRACE
256 ---help---
257 Enable this for messages that may help debug problems with the
258 block manager locking used by thin provisioning.
259
260 If unsure, say N.
261
262config DM_CACHE 257config DM_CACHE
263 tristate "Cache target (EXPERIMENTAL)" 258 tristate "Cache target (EXPERIMENTAL)"
264 depends on BLK_DEV_DM 259 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2acc43fe0229..f26d83292579 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BCACHE) += bcache/ 32obj-$(CONFIG_BCACHE) += bcache/
33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o 34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
35obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
35obj-$(CONFIG_DM_BUFIO) += dm-bufio.o 36obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
36obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o 37obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
37obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 38obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@
1 1
2obj-$(CONFIG_BCACHE) += bcache.o 2obj-$(CONFIG_BCACHE) += bcache.o
3 3
4bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ 4bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
5 movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o 5 io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
6 util.o writeback.o
6 7
7CFLAGS_request.o += -Iblock 8CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
132{ 132{
133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); 133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
134 134
135 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && 135 if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
136 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) 136 unsigned i;
137 return false; 137
138 for (i = 0; i < RESERVE_NONE; i++)
139 if (!fifo_full(&ca->free[i]))
140 goto add;
138 141
142 return false;
143 }
144add:
139 b->prio = 0; 145 b->prio = 0;
140 146
141 if (can_inc_bucket_gen(b) && 147 if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
162 fifo_push(&ca->free_inc, b - ca->buckets); 168 fifo_push(&ca->free_inc, b - ca->buckets);
163} 169}
164 170
165#define bucket_prio(b) \ 171/*
166 (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) 172 * Determines what order we're going to reuse buckets, smallest bucket_prio()
173 * first: we also take into account the number of sectors of live data in that
 174 * bucket, and for that multiplication to make sense the priorities have to be scaled.
175 *
176 * Thus, we scale the bucket priorities so that the bucket with the smallest
177 * prio is worth 1/8th of what INITIAL_PRIO is worth.
178 */
179
180#define bucket_prio(b) \
181({ \
182 unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
183 \
184 (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
185})
167 186
168#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 187#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
169#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 188#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
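The scaling described in the comment above can be sanity-checked with a small user-space model; everything below (the struct, the min_prio of 1000, the sector counts) is illustrative and only mirrors the shape of the new macro, it is not kernel code:

#include <stdio.h>
#include <stdint.h>

#define INITIAL_PRIO 32768U

/* Stand-ins for the kernel's struct bucket / cache_set fields. */
struct bucket {
        uint16_t prio;
        unsigned sectors_used;  /* what GC_SECTORS_USED(b) would return */
};

static unsigned bucket_prio(const struct bucket *b, unsigned set_min_prio)
{
        /* Same scaling as the macro above: the coldest bucket is still
         * worth 1/8th of INITIAL_PRIO, so its live-sector count still
         * matters instead of being multiplied by zero. */
        unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

        return (b->prio - set_min_prio + min_prio) * b->sectors_used;
}

int main(void)
{
        unsigned set_min_prio = 1000;   /* illustrative */
        struct bucket cold = { .prio = 1000, .sectors_used = 128 };
        struct bucket hot  = { .prio = 9000, .sectors_used = 128 };

        printf("cold %u, hot %u\n",
               bucket_prio(&cold, set_min_prio),
               bucket_prio(&hot, set_min_prio));
        return 0;
}

With the old definition a bucket sitting exactly at min_prio multiplied out to zero no matter how much live data it held; the added min_prio offset keeps GC_SECTORS_USED() relevant even for the coldest buckets, which is what the 1/8th-of-INITIAL_PRIO scaling is about.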
@@ -304,6 +323,21 @@ do { \
304 __set_current_state(TASK_RUNNING); \ 323 __set_current_state(TASK_RUNNING); \
305} while (0) 324} while (0)
306 325
326static int bch_allocator_push(struct cache *ca, long bucket)
327{
328 unsigned i;
329
330 /* Prios/gens are actually the most important reserve */
331 if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
332 return true;
333
334 for (i = 0; i < RESERVE_NR; i++)
335 if (fifo_push(&ca->free[i], bucket))
336 return true;
337
338 return false;
339}
340
307static int bch_allocator_thread(void *arg) 341static int bch_allocator_thread(void *arg)
308{ 342{
309 struct cache *ca = arg; 343 struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
336 mutex_lock(&ca->set->bucket_lock); 370 mutex_lock(&ca->set->bucket_lock);
337 } 371 }
338 372
339 allocator_wait(ca, !fifo_full(&ca->free)); 373 allocator_wait(ca, bch_allocator_push(ca, bucket));
340
341 fifo_push(&ca->free, bucket);
342 wake_up(&ca->set->bucket_wait); 374 wake_up(&ca->set->bucket_wait);
343 } 375 }
344 376
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
365 } 397 }
366} 398}
367 399
368long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) 400long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
369{ 401{
370 DEFINE_WAIT(w); 402 DEFINE_WAIT(w);
371 struct bucket *b; 403 struct bucket *b;
372 long r; 404 long r;
373 405
374 /* fastpath */ 406 /* fastpath */
375 if (fifo_used(&ca->free) > ca->watermark[watermark]) { 407 if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
376 fifo_pop(&ca->free, r); 408 fifo_pop(&ca->free[reserve], r))
377 goto out; 409 goto out;
378 }
379 410
380 if (!wait) 411 if (!wait)
381 return -1; 412 return -1;
382 413
383 while (1) { 414 do {
384 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385 fifo_pop(&ca->free, r);
386 break;
387 }
388
389 prepare_to_wait(&ca->set->bucket_wait, &w, 415 prepare_to_wait(&ca->set->bucket_wait, &w,
390 TASK_UNINTERRUPTIBLE); 416 TASK_UNINTERRUPTIBLE);
391 417
392 mutex_unlock(&ca->set->bucket_lock); 418 mutex_unlock(&ca->set->bucket_lock);
393 schedule(); 419 schedule();
394 mutex_lock(&ca->set->bucket_lock); 420 mutex_lock(&ca->set->bucket_lock);
395 } 421 } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
422 !fifo_pop(&ca->free[reserve], r));
396 423
397 finish_wait(&ca->set->bucket_wait, &w); 424 finish_wait(&ca->set->bucket_wait, &w);
398out: 425out:
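A minimal user-space model of the new reserve scheme (fixed-size ring buffers stand in for the kernel fifo macros; the RESERVE_* names follow the enum added to bcache.h, everything else is made up for illustration):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

enum alloc_reserve {
        RESERVE_BTREE,
        RESERVE_PRIO,
        RESERVE_MOVINGGC,
        RESERVE_NONE,
        RESERVE_NR,
};

#define FIFO_SIZE 8     /* power of two, so the wrapping below works */

struct fifo {
        long buf[FIFO_SIZE];
        unsigned head, tail;
};

static bool fifo_push(struct fifo *f, long v)
{
        if (f->head - f->tail == FIFO_SIZE)
                return false;
        f->buf[f->head++ % FIFO_SIZE] = v;
        return true;
}

static bool fifo_pop(struct fifo *f, long *v)
{
        if (f->head == f->tail)
                return false;
        *v = f->buf[f->tail++ % FIFO_SIZE];
        return true;
}

/* Mirrors bch_allocator_push(): the prio/gen reserve is filled first,
 * then whichever other reserve still has room. */
static bool allocator_push(struct fifo free[], long bucket)
{
        unsigned i;

        if (fifo_push(&free[RESERVE_PRIO], bucket))
                return true;

        for (i = 0; i < RESERVE_NR; i++)
                if (fifo_push(&free[i], bucket))
                        return true;

        return false;
}

/* Mirrors the bch_bucket_alloc() fastpath: drain the unreserved pool
 * before dipping into the caller's own reserve. */
static bool bucket_alloc(struct fifo free[], unsigned reserve, long *r)
{
        return fifo_pop(&free[RESERVE_NONE], r) ||
               fifo_pop(&free[reserve], r);
}

int main(void)
{
        struct fifo free[RESERVE_NR];
        long b;
        unsigned i;

        memset(free, 0, sizeof(free));

        for (i = 0; i < 40; i++)
                allocator_push(free, i);

        while (bucket_alloc(free, RESERVE_MOVINGGC, &b))
                printf("got bucket %ld\n", b);
        return 0;
}

Pushing 40 buckets fills every reserve; the RESERVE_MOVINGGC caller then drains RESERVE_NONE (buckets 24-31) before touching its own reserve (16-23), matching the ordering of the fastpath above.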
@@ -401,12 +428,14 @@ out:
401 if (expensive_debug_checks(ca->set)) { 428 if (expensive_debug_checks(ca->set)) {
402 size_t iter; 429 size_t iter;
403 long i; 430 long i;
431 unsigned j;
404 432
405 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 433 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
406 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 434 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
407 435
408 fifo_for_each(i, &ca->free, iter) 436 for (j = 0; j < RESERVE_NR; j++)
409 BUG_ON(i == r); 437 fifo_for_each(i, &ca->free[j], iter)
438 BUG_ON(i == r);
410 fifo_for_each(i, &ca->free_inc, iter) 439 fifo_for_each(i, &ca->free_inc, iter)
411 BUG_ON(i == r); 440 BUG_ON(i == r);
412 fifo_for_each(i, &ca->unused, iter) 441 fifo_for_each(i, &ca->unused, iter)
@@ -419,7 +448,7 @@ out:
419 448
420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 449 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
421 450
422 if (watermark <= WATERMARK_METADATA) { 451 if (reserve <= RESERVE_PRIO) {
423 SET_GC_MARK(b, GC_MARK_METADATA); 452 SET_GC_MARK(b, GC_MARK_METADATA);
424 SET_GC_MOVE(b, 0); 453 SET_GC_MOVE(b, 0);
425 b->prio = BTREE_PRIO; 454 b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
445 } 474 }
446} 475}
447 476
448int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 477int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
449 struct bkey *k, int n, bool wait) 478 struct bkey *k, int n, bool wait)
450{ 479{
451 int i; 480 int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
459 488
460 for (i = 0; i < n; i++) { 489 for (i = 0; i < n; i++) {
461 struct cache *ca = c->cache_by_alloc[i]; 490 struct cache *ca = c->cache_by_alloc[i];
462 long b = bch_bucket_alloc(ca, watermark, wait); 491 long b = bch_bucket_alloc(ca, reserve, wait);
463 492
464 if (b == -1) 493 if (b == -1)
465 goto err; 494 goto err;
@@ -478,12 +507,12 @@ err:
478 return -1; 507 return -1;
479} 508}
480 509
481int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 510int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
482 struct bkey *k, int n, bool wait) 511 struct bkey *k, int n, bool wait)
483{ 512{
484 int ret; 513 int ret;
485 mutex_lock(&c->bucket_lock); 514 mutex_lock(&c->bucket_lock);
486 ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); 515 ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
487 mutex_unlock(&c->bucket_lock); 516 mutex_unlock(&c->bucket_lock);
488 return ret; 517 return ret;
489} 518}
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
573 602
574 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 603 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
575 unsigned watermark = write_prio 604 unsigned watermark = write_prio
576 ? WATERMARK_MOVINGGC 605 ? RESERVE_MOVINGGC
577 : WATERMARK_NONE; 606 : RESERVE_NONE;
578 607
579 spin_unlock(&c->data_bucket_lock); 608 spin_unlock(&c->data_bucket_lock);
580 609
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
689 * Then 8 for btree allocations 718 * Then 8 for btree allocations
690 * Then half for the moving garbage collector 719 * Then half for the moving garbage collector
691 */ 720 */
692 721#if 0
693 ca->watermark[WATERMARK_PRIO] = 0; 722 ca->watermark[WATERMARK_PRIO] = 0;
694 723
695 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); 724 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
699 728
700 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 729 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
701 ca->watermark[WATERMARK_MOVINGGC]; 730 ca->watermark[WATERMARK_MOVINGGC];
702 731#endif
703 return 0; 732 return 0;
704} 733}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 754f43177483..a4c7306ff43d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@
187#include <linux/types.h> 187#include <linux/types.h>
188#include <linux/workqueue.h> 188#include <linux/workqueue.h>
189 189
190#include "bset.h"
190#include "util.h" 191#include "util.h"
191#include "closure.h" 192#include "closure.h"
192 193
@@ -209,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
209#define GC_MARK_RECLAIMABLE 0 210#define GC_MARK_RECLAIMABLE 0
210#define GC_MARK_DIRTY 1 211#define GC_MARK_DIRTY 1
211#define GC_MARK_METADATA 2 212#define GC_MARK_METADATA 2
212BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); 213#define GC_SECTORS_USED_SIZE 13
214#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
215BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
213BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); 216BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
214 217
215#include "journal.h" 218#include "journal.h"
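MAX_GC_SECTORS_USED is just the all-ones value of the 13-bit slice of gc_mark; the bit arithmetic is easy to check standalone (the clamping at the end is a made-up caller, not a quote from this patch):

#include <stdio.h>
#include <assert.h>

#define GC_SECTORS_USED_SIZE    13
#define MAX_GC_SECTORS_USED     (~(~0ULL << GC_SECTORS_USED_SIZE))

int main(void)
{
        /* ~0ULL << 13 zeroes the low 13 bits; inverting keeps only them. */
        assert(MAX_GC_SECTORS_USED == 8191);    /* (1 << 13) - 1 */

        /* A caller storing a sector count would clamp to the field width. */
        unsigned sectors = 20000;
        unsigned stored = sectors > MAX_GC_SECTORS_USED
                ? (unsigned) MAX_GC_SECTORS_USED
                : sectors;

        printf("stored %u of %u sectors\n", stored, sectors);
        return 0;
}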
@@ -280,7 +283,6 @@ struct bcache_device {
280 unsigned long sectors_dirty_last; 283 unsigned long sectors_dirty_last;
281 long sectors_dirty_derivative; 284 long sectors_dirty_derivative;
282 285
283 mempool_t *unaligned_bvec;
284 struct bio_set *bio_split; 286 struct bio_set *bio_split;
285 287
286 unsigned data_csum:1; 288 unsigned data_csum:1;
@@ -310,7 +312,8 @@ struct cached_dev {
310 struct cache_sb sb; 312 struct cache_sb sb;
311 struct bio sb_bio; 313 struct bio sb_bio;
312 struct bio_vec sb_bv[1]; 314 struct bio_vec sb_bv[1];
313 struct closure_with_waitlist sb_write; 315 struct closure sb_write;
316 struct semaphore sb_write_mutex;
314 317
315 /* Refcount on the cache set. Always nonzero when we're caching. */ 318 /* Refcount on the cache set. Always nonzero when we're caching. */
316 atomic_t count; 319 atomic_t count;
@@ -383,12 +386,12 @@ struct cached_dev {
383 unsigned writeback_rate_p_term_inverse; 386 unsigned writeback_rate_p_term_inverse;
384}; 387};
385 388
386enum alloc_watermarks { 389enum alloc_reserve {
387 WATERMARK_PRIO, 390 RESERVE_BTREE,
388 WATERMARK_METADATA, 391 RESERVE_PRIO,
389 WATERMARK_MOVINGGC, 392 RESERVE_MOVINGGC,
390 WATERMARK_NONE, 393 RESERVE_NONE,
391 WATERMARK_MAX 394 RESERVE_NR,
392}; 395};
393 396
394struct cache { 397struct cache {
@@ -400,8 +403,6 @@ struct cache {
400 struct kobject kobj; 403 struct kobject kobj;
401 struct block_device *bdev; 404 struct block_device *bdev;
402 405
403 unsigned watermark[WATERMARK_MAX];
404
405 struct task_struct *alloc_thread; 406 struct task_struct *alloc_thread;
406 407
407 struct closure prio; 408 struct closure prio;
@@ -430,7 +431,7 @@ struct cache {
430 * because all the data they contained was overwritten), so we only 431 * because all the data they contained was overwritten), so we only
431 * need to discard them before they can be moved to the free list. 432 * need to discard them before they can be moved to the free list.
432 */ 433 */
433 DECLARE_FIFO(long, free); 434 DECLARE_FIFO(long, free)[RESERVE_NR];
434 DECLARE_FIFO(long, free_inc); 435 DECLARE_FIFO(long, free_inc);
435 DECLARE_FIFO(long, unused); 436 DECLARE_FIFO(long, unused);
436 437
@@ -515,7 +516,8 @@ struct cache_set {
515 uint64_t cached_dev_sectors; 516 uint64_t cached_dev_sectors;
516 struct closure caching; 517 struct closure caching;
517 518
518 struct closure_with_waitlist sb_write; 519 struct closure sb_write;
520 struct semaphore sb_write_mutex;
519 521
520 mempool_t *search; 522 mempool_t *search;
521 mempool_t *bio_meta; 523 mempool_t *bio_meta;
@@ -630,13 +632,15 @@ struct cache_set {
630 632
631#ifdef CONFIG_BCACHE_DEBUG 633#ifdef CONFIG_BCACHE_DEBUG
632 struct btree *verify_data; 634 struct btree *verify_data;
635 struct bset *verify_ondisk;
633 struct mutex verify_lock; 636 struct mutex verify_lock;
634#endif 637#endif
635 638
636 unsigned nr_uuids; 639 unsigned nr_uuids;
637 struct uuid_entry *uuids; 640 struct uuid_entry *uuids;
638 BKEY_PADDED(uuid_bucket); 641 BKEY_PADDED(uuid_bucket);
639 struct closure_with_waitlist uuid_write; 642 struct closure uuid_write;
643 struct semaphore uuid_write_mutex;
640 644
641 /* 645 /*
642 * A btree node on disk could have too many bsets for an iterator to fit 646 * A btree node on disk could have too many bsets for an iterator to fit
@@ -644,13 +648,7 @@ struct cache_set {
644 */ 648 */
645 mempool_t *fill_iter; 649 mempool_t *fill_iter;
646 650
647 /* 651 struct bset_sort_state sort;
648 * btree_sort() is a merge sort and requires temporary space - single
649 * element mempool
650 */
651 struct mutex sort_lock;
652 struct bset *sort;
653 unsigned sort_crit_factor;
654 652
655 /* List of buckets we're currently writing data to */ 653 /* List of buckets we're currently writing data to */
656 struct list_head data_buckets; 654 struct list_head data_buckets;
@@ -666,7 +664,6 @@ struct cache_set {
666 unsigned congested_read_threshold_us; 664 unsigned congested_read_threshold_us;
667 unsigned congested_write_threshold_us; 665 unsigned congested_write_threshold_us;
668 666
669 struct time_stats sort_time;
670 struct time_stats btree_gc_time; 667 struct time_stats btree_gc_time;
671 struct time_stats btree_split_time; 668 struct time_stats btree_split_time;
672 struct time_stats btree_read_time; 669 struct time_stats btree_read_time;
@@ -684,9 +681,9 @@ struct cache_set {
684 unsigned error_decay; 681 unsigned error_decay;
685 682
686 unsigned short journal_delay_ms; 683 unsigned short journal_delay_ms;
684 bool expensive_debug_checks;
687 unsigned verify:1; 685 unsigned verify:1;
688 unsigned key_merging_disabled:1; 686 unsigned key_merging_disabled:1;
689 unsigned expensive_debug_checks:1;
690 unsigned gc_always_rewrite:1; 687 unsigned gc_always_rewrite:1;
691 unsigned shrinker_disabled:1; 688 unsigned shrinker_disabled:1;
692 unsigned copy_gc_enabled:1; 689 unsigned copy_gc_enabled:1;
@@ -708,13 +705,8 @@ struct bbio {
708 struct bio bio; 705 struct bio bio;
709}; 706};
710 707
711static inline unsigned local_clock_us(void)
712{
713 return local_clock() >> 10;
714}
715
716#define BTREE_PRIO USHRT_MAX 708#define BTREE_PRIO USHRT_MAX
717#define INITIAL_PRIO 32768 709#define INITIAL_PRIO 32768U
718 710
719#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) 711#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
720#define btree_blocks(b) \ 712#define btree_blocks(b) \
@@ -727,21 +719,6 @@ static inline unsigned local_clock_us(void)
727#define bucket_bytes(c) ((c)->sb.bucket_size << 9) 719#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
728#define block_bytes(c) ((c)->sb.block_size << 9) 720#define block_bytes(c) ((c)->sb.block_size << 9)
729 721
730#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
731#define set_bytes(i) __set_bytes(i, i->keys)
732
733#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
734#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
735
736#define node(i, j) ((struct bkey *) ((i)->d + (j)))
737#define end(i) node(i, (i)->keys)
738
739#define index(i, b) \
740 ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
741 block_bytes(b->c)))
742
743#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
744
745#define prios_per_bucket(c) \ 722#define prios_per_bucket(c) \
746 ((bucket_bytes(c) - sizeof(struct prio_set)) / \ 723 ((bucket_bytes(c) - sizeof(struct prio_set)) / \
747 sizeof(struct bucket_disk)) 724 sizeof(struct bucket_disk))
@@ -784,20 +761,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
784 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); 761 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
785} 762}
786 763
787/* Btree key macros */ 764static inline uint8_t gen_after(uint8_t a, uint8_t b)
765{
766 uint8_t r = a - b;
767 return r > 128U ? 0 : r;
768}
788 769
789static inline void bkey_init(struct bkey *k) 770static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
771 unsigned i)
790{ 772{
791 *k = ZERO_KEY; 773 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
792} 774}
793 775
776static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
777 unsigned i)
778{
779 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
780}
781
782/* Btree key macros */
783
794/* 784/*
795 * This is used for various on disk data structures - cache_sb, prio_set, bset, 785 * This is used for various on disk data structures - cache_sb, prio_set, bset,
796 * jset: The checksum is _always_ the first 8 bytes of these structs 786 * jset: The checksum is _always_ the first 8 bytes of these structs
797 */ 787 */
798#define csum_set(i) \ 788#define csum_set(i) \
799 bch_crc64(((void *) (i)) + sizeof(uint64_t), \ 789 bch_crc64(((void *) (i)) + sizeof(uint64_t), \
800 ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) 790 ((void *) bset_bkey_last(i)) - \
791 (((void *) (i)) + sizeof(uint64_t)))
801 792
802/* Error handling macros */ 793/* Error handling macros */
803 794
@@ -902,7 +893,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
902void bch_bbio_free(struct bio *, struct cache_set *); 893void bch_bbio_free(struct bio *, struct cache_set *);
903struct bio *bch_bbio_alloc(struct cache_set *); 894struct bio *bch_bbio_alloc(struct cache_set *);
904 895
905struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *);
906void bch_generic_make_request(struct bio *, struct bio_split_pool *); 896void bch_generic_make_request(struct bio *, struct bio_split_pool *);
907void __bch_submit_bbio(struct bio *, struct cache_set *); 897void __bch_submit_bbio(struct bio *, struct cache_set *);
908void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); 898void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..3f74b4b0747b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8#include "bcache.h" 8#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
9#include "btree.h"
10#include "debug.h"
11 9
10#include "util.h"
11#include "bset.h"
12
13#include <linux/console.h>
12#include <linux/random.h> 14#include <linux/random.h>
13#include <linux/prefetch.h> 15#include <linux/prefetch.h>
14 16
17#ifdef CONFIG_BCACHE_DEBUG
18
19void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
20{
21 struct bkey *k, *next;
22
23 for (k = i->start; k < bset_bkey_last(i); k = next) {
24 next = bkey_next(k);
25
26 printk(KERN_ERR "block %u key %li/%u: ", set,
27 (uint64_t *) k - i->d, i->keys);
28
29 if (b->ops->key_dump)
30 b->ops->key_dump(b, k);
31 else
32 printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
33
34 if (next < bset_bkey_last(i) &&
35 bkey_cmp(k, b->ops->is_extents ?
36 &START_KEY(next) : next) > 0)
37 printk(KERN_ERR "Key skipped backwards\n");
38 }
39}
40
41void bch_dump_bucket(struct btree_keys *b)
42{
43 unsigned i;
44
45 console_lock();
46 for (i = 0; i <= b->nsets; i++)
47 bch_dump_bset(b, b->set[i].data,
48 bset_sector_offset(b, b->set[i].data));
49 console_unlock();
50}
51
52int __bch_count_data(struct btree_keys *b)
53{
54 unsigned ret = 0;
55 struct btree_iter iter;
56 struct bkey *k;
57
58 if (b->ops->is_extents)
59 for_each_key(b, k, &iter)
60 ret += KEY_SIZE(k);
61 return ret;
62}
63
64void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
65{
66 va_list args;
67 struct bkey *k, *p = NULL;
68 struct btree_iter iter;
69 const char *err;
70
71 for_each_key(b, k, &iter) {
72 if (b->ops->is_extents) {
73 err = "Keys out of order";
74 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
75 goto bug;
76
77 if (bch_ptr_invalid(b, k))
78 continue;
79
80 err = "Overlapping keys";
81 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
82 goto bug;
83 } else {
84 if (bch_ptr_bad(b, k))
85 continue;
86
87 err = "Duplicate keys";
88 if (p && !bkey_cmp(p, k))
89 goto bug;
90 }
91 p = k;
92 }
93#if 0
94 err = "Key larger than btree node key";
95 if (p && bkey_cmp(p, &b->key) > 0)
96 goto bug;
97#endif
98 return;
99bug:
100 bch_dump_bucket(b);
101
102 va_start(args, fmt);
103 vprintk(fmt, args);
104 va_end(args);
105
106 panic("bch_check_keys error: %s:\n", err);
107}
108
109static void bch_btree_iter_next_check(struct btree_iter *iter)
110{
111 struct bkey *k = iter->data->k, *next = bkey_next(k);
112
113 if (next < iter->data->end &&
114 bkey_cmp(k, iter->b->ops->is_extents ?
115 &START_KEY(next) : next) > 0) {
116 bch_dump_bucket(iter->b);
117 panic("Key skipped backwards\n");
118 }
119}
120
121#else
122
123static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
124
125#endif
126
15/* Keylists */ 127/* Keylists */
16 128
17int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 129int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
18{ 130{
19 size_t oldsize = bch_keylist_nkeys(l); 131 size_t oldsize = bch_keylist_nkeys(l);
20 size_t newsize = oldsize + 2 + nptrs; 132 size_t newsize = oldsize + u64s;
21 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; 133 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22 uint64_t *new_keys; 134 uint64_t *new_keys;
23 135
24 /* The journalling code doesn't handle the case where the keys to insert
25 * is bigger than an empty write: If we just return -ENOMEM here,
26 * bio_insert() and bio_invalidate() will insert the keys created so far
27 * and finish the rest when the keylist is empty.
28 */
29 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
30 return -ENOMEM;
31
32 newsize = roundup_pow_of_two(newsize); 136 newsize = roundup_pow_of_two(newsize);
33 137
34 if (newsize <= KEYLIST_INLINE || 138 if (newsize <= KEYLIST_INLINE ||
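__bch_keylist_realloc() now grows by a caller-supplied number of u64s and rounds the allocation up to a power of two; a toy model of just that sizing policy (the KEYLIST_INLINE value and the append sizes are assumptions for illustration, and the real function also skips the realloc when the rounded size does not change):

#include <stdio.h>
#include <stddef.h>

#define KEYLIST_INLINE 16       /* assumed size of the inline buffer, in u64s */

/* User-space stand-in for the kernel's roundup_pow_of_two(). */
static size_t roundup_pow_of_two(size_t n)
{
        size_t r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

int main(void)
{
        size_t nkeys = 0, allocated = KEYLIST_INLINE;
        size_t adds[] = { 3, 5, 9, 24 };        /* u64s appended per insert, made up */
        unsigned i;

        for (i = 0; i < sizeof(adds) / sizeof(adds[0]); i++) {
                size_t newsize = roundup_pow_of_two(nkeys + adds[i]);

                /* Only grow once we no longer fit in the inline buffer. */
                if (newsize > KEYLIST_INLINE && newsize > allocated) {
                        printf("grow %zu -> %zu u64s\n", allocated, newsize);
                        allocated = newsize;
                }
                nkeys += adds[i];
        }
        return 0;
}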
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
71 bch_keylist_bytes(l)); 175 bch_keylist_bytes(l));
72} 176}
73 177
74/* Pointer validation */
75
76static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
77{
78 unsigned i;
79
80 for (i = 0; i < KEY_PTRS(k); i++)
81 if (ptr_available(c, k, i)) {
82 struct cache *ca = PTR_CACHE(c, k, i);
83 size_t bucket = PTR_BUCKET_NR(c, k, i);
84 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
85
86 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
87 bucket < ca->sb.first_bucket ||
88 bucket >= ca->sb.nbuckets)
89 return true;
90 }
91
92 return false;
93}
94
95bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
96{
97 char buf[80];
98
99 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
100 goto bad;
101
102 if (__ptr_invalid(c, k))
103 goto bad;
104
105 return false;
106bad:
107 bch_bkey_to_text(buf, sizeof(buf), k);
108 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
109 return true;
110}
111
112bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
113{
114 char buf[80];
115
116 if (!KEY_SIZE(k))
117 return true;
118
119 if (KEY_SIZE(k) > KEY_OFFSET(k))
120 goto bad;
121
122 if (__ptr_invalid(c, k))
123 goto bad;
124
125 return false;
126bad:
127 bch_bkey_to_text(buf, sizeof(buf), k);
128 cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
129 return true;
130}
131
132static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
133 unsigned ptr)
134{
135 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
136 char buf[80];
137
138 if (mutex_trylock(&b->c->bucket_lock)) {
139 if (b->level) {
140 if (KEY_DIRTY(k) ||
141 g->prio != BTREE_PRIO ||
142 (b->c->gc_mark_valid &&
143 GC_MARK(g) != GC_MARK_METADATA))
144 goto err;
145
146 } else {
147 if (g->prio == BTREE_PRIO)
148 goto err;
149
150 if (KEY_DIRTY(k) &&
151 b->c->gc_mark_valid &&
152 GC_MARK(g) != GC_MARK_DIRTY)
153 goto err;
154 }
155 mutex_unlock(&b->c->bucket_lock);
156 }
157
158 return false;
159err:
160 mutex_unlock(&b->c->bucket_lock);
161 bch_bkey_to_text(buf, sizeof(buf), k);
162 btree_bug(b,
163"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
164 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
165 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
166 return true;
167}
168
169bool bch_ptr_bad(struct btree *b, const struct bkey *k)
170{
171 struct bucket *g;
172 unsigned i, stale;
173
174 if (!bkey_cmp(k, &ZERO_KEY) ||
175 !KEY_PTRS(k) ||
176 bch_ptr_invalid(b, k))
177 return true;
178
179 for (i = 0; i < KEY_PTRS(k); i++) {
180 if (!ptr_available(b->c, k, i))
181 return true;
182
183 g = PTR_BUCKET(b->c, k, i);
184 stale = ptr_stale(b->c, k, i);
185
186 btree_bug_on(stale > 96, b,
187 "key too stale: %i, need_gc %u",
188 stale, b->c->need_gc);
189
190 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
191 b, "stale dirty pointer");
192
193 if (stale)
194 return true;
195
196 if (expensive_debug_checks(b->c) &&
197 ptr_bad_expensive_checks(b, k, i))
198 return true;
199 }
200
201 return false;
202}
203
204/* Key/pointer manipulation */ 178/* Key/pointer manipulation */
205 179
206void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, 180void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
255 return true; 229 return true;
256} 230}
257 231
258static uint64_t merge_chksums(struct bkey *l, struct bkey *r) 232/* Auxiliary search trees */
233
234/* 32 bits total: */
235#define BKEY_MID_BITS 3
236#define BKEY_EXPONENT_BITS 7
237#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
238#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
239
240struct bkey_float {
241 unsigned exponent:BKEY_EXPONENT_BITS;
242 unsigned m:BKEY_MID_BITS;
243 unsigned mantissa:BKEY_MANTISSA_BITS;
244} __packed;
245
246/*
247 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
248 * it used to be 64, but I realized the lookup code would touch slightly less
249 * memory if it was 128.
250 *
 251 * It defines the number of bytes (in struct bset) per struct bkey_float in
 252 * the auxiliary search tree - when we're done searching the bset_float tree we
253 * have this many bytes left that we do a linear search over.
254 *
255 * Since (after level 5) every level of the bset_tree is on a new cacheline,
256 * we're touching one fewer cacheline in the bset tree in exchange for one more
257 * cacheline in the linear search - but the linear search might stop before it
258 * gets to the second cacheline.
259 */
260
261#define BSET_CACHELINE 128
262
263/* Space required for the btree node keys */
264static inline size_t btree_keys_bytes(struct btree_keys *b)
259{ 265{
260 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & 266 return PAGE_SIZE << b->page_order;
261 ~((uint64_t)1 << 63);
262} 267}
263 268
264/* Tries to merge l and r: l should be lower than r 269static inline size_t btree_keys_cachelines(struct btree_keys *b)
265 * Returns true if we were able to merge. If we did merge, l will be the merged
266 * key, r will be untouched.
267 */
268bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
269{ 270{
270 unsigned i; 271 return btree_keys_bytes(b) / BSET_CACHELINE;
272}
271 273
272 if (key_merging_disabled(b->c)) 274/* Space required for the auxiliary search trees */
273 return false; 275static inline size_t bset_tree_bytes(struct btree_keys *b)
276{
277 return btree_keys_cachelines(b) * sizeof(struct bkey_float);
278}
274 279
275 if (KEY_PTRS(l) != KEY_PTRS(r) || 280/* Space required for the prev pointers */
276 KEY_DIRTY(l) != KEY_DIRTY(r) || 281static inline size_t bset_prev_bytes(struct btree_keys *b)
277 bkey_cmp(l, &START_KEY(r))) 282{
278 return false; 283 return btree_keys_cachelines(b) * sizeof(uint8_t);
284}
279 285
280 for (i = 0; i < KEY_PTRS(l); i++) 286/* Memory allocation */
281 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
282 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
283 return false;
284 287
285 /* Keys with no pointers aren't restricted to one bucket and could 288void bch_btree_keys_free(struct btree_keys *b)
286 * overflow KEY_SIZE 289{
287 */ 290 struct bset_tree *t = b->set;
288 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
289 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
290 SET_KEY_SIZE(l, USHRT_MAX);
291 291
292 bch_cut_front(l, r); 292 if (bset_prev_bytes(b) < PAGE_SIZE)
293 return false; 293 kfree(t->prev);
294 } 294 else
295 free_pages((unsigned long) t->prev,
296 get_order(bset_prev_bytes(b)));
295 297
296 if (KEY_CSUM(l)) { 298 if (bset_tree_bytes(b) < PAGE_SIZE)
297 if (KEY_CSUM(r)) 299 kfree(t->tree);
298 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); 300 else
299 else 301 free_pages((unsigned long) t->tree,
300 SET_KEY_CSUM(l, 0); 302 get_order(bset_tree_bytes(b)));
301 }
302 303
303 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); 304 free_pages((unsigned long) t->data, b->page_order);
304 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
305 305
306 return true; 306 t->prev = NULL;
307 t->tree = NULL;
308 t->data = NULL;
309}
310EXPORT_SYMBOL(bch_btree_keys_free);
311
312int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
313{
314 struct bset_tree *t = b->set;
315
316 BUG_ON(t->data);
317
318 b->page_order = page_order;
319
320 t->data = (void *) __get_free_pages(gfp, b->page_order);
321 if (!t->data)
322 goto err;
323
324 t->tree = bset_tree_bytes(b) < PAGE_SIZE
325 ? kmalloc(bset_tree_bytes(b), gfp)
326 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
327 if (!t->tree)
328 goto err;
329
330 t->prev = bset_prev_bytes(b) < PAGE_SIZE
331 ? kmalloc(bset_prev_bytes(b), gfp)
332 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
333 if (!t->prev)
334 goto err;
335
336 return 0;
337err:
338 bch_btree_keys_free(b);
339 return -ENOMEM;
307} 340}
341EXPORT_SYMBOL(bch_btree_keys_alloc);
342
343void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
344 bool *expensive_debug_checks)
345{
346 unsigned i;
347
348 b->ops = ops;
349 b->expensive_debug_checks = expensive_debug_checks;
350 b->nsets = 0;
351 b->last_set_unwritten = 0;
352
353 /* XXX: shouldn't be needed */
354 for (i = 0; i < MAX_BSETS; i++)
355 b->set[i].size = 0;
356 /*
357 * Second loop starts at 1 because b->keys[0]->data is the memory we
358 * allocated
359 */
360 for (i = 1; i < MAX_BSETS; i++)
361 b->set[i].data = NULL;
362}
363EXPORT_SYMBOL(bch_btree_keys_init);
308 364
309/* Binary tree stuff for auxiliary search trees */ 365/* Binary tree stuff for auxiliary search trees */
310 366
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
455 return ((void *) k - (void *) t->data) / BSET_CACHELINE; 511 return ((void *) k - (void *) t->data) / BSET_CACHELINE;
456} 512}
457 513
458static unsigned bkey_to_cacheline_offset(struct bkey *k) 514static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
515 unsigned cacheline,
516 struct bkey *k)
459{ 517{
460 return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); 518 return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
461} 519}
462 520
463static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) 521static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
504 : tree_to_prev_bkey(t, j >> ffs(j)); 562 : tree_to_prev_bkey(t, j >> ffs(j));
505 563
506 struct bkey *r = is_power_of_2(j + 1) 564 struct bkey *r = is_power_of_2(j + 1)
507 ? node(t->data, t->data->keys - bkey_u64s(&t->end)) 565 ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
508 : tree_to_bkey(t, j >> (ffz(j) + 1)); 566 : tree_to_bkey(t, j >> (ffz(j) + 1));
509 567
510 BUG_ON(m < l || m > r); 568 BUG_ON(m < l || m > r);
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
528 f->exponent = 127; 586 f->exponent = 127;
529} 587}
530 588
531static void bset_alloc_tree(struct btree *b, struct bset_tree *t) 589static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
532{ 590{
533 if (t != b->sets) { 591 if (t != b->set) {
534 unsigned j = roundup(t[-1].size, 592 unsigned j = roundup(t[-1].size,
535 64 / sizeof(struct bkey_float)); 593 64 / sizeof(struct bkey_float));
536 594
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
538 t->prev = t[-1].prev + j; 596 t->prev = t[-1].prev + j;
539 } 597 }
540 598
541 while (t < b->sets + MAX_BSETS) 599 while (t < b->set + MAX_BSETS)
542 t++->size = 0; 600 t++->size = 0;
543} 601}
544 602
545static void bset_build_unwritten_tree(struct btree *b) 603static void bch_bset_build_unwritten_tree(struct btree_keys *b)
546{ 604{
547 struct bset_tree *t = b->sets + b->nsets; 605 struct bset_tree *t = bset_tree_last(b);
606
607 BUG_ON(b->last_set_unwritten);
608 b->last_set_unwritten = 1;
548 609
549 bset_alloc_tree(b, t); 610 bset_alloc_tree(b, t);
550 611
551 if (t->tree != b->sets->tree + bset_tree_space(b)) { 612 if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
552 t->prev[0] = bkey_to_cacheline_offset(t->data->start); 613 t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
553 t->size = 1; 614 t->size = 1;
554 } 615 }
555} 616}
556 617
557static void bset_build_written_tree(struct btree *b) 618void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
619{
620 if (i != b->set->data) {
621 b->set[++b->nsets].data = i;
622 i->seq = b->set->data->seq;
623 } else
624 get_random_bytes(&i->seq, sizeof(uint64_t));
625
626 i->magic = magic;
627 i->version = 0;
628 i->keys = 0;
629
630 bch_bset_build_unwritten_tree(b);
631}
632EXPORT_SYMBOL(bch_bset_init_next);
633
634void bch_bset_build_written_tree(struct btree_keys *b)
558{ 635{
559 struct bset_tree *t = b->sets + b->nsets; 636 struct bset_tree *t = bset_tree_last(b);
560 struct bkey *k = t->data->start; 637 struct bkey *prev = NULL, *k = t->data->start;
561 unsigned j, cacheline = 1; 638 unsigned j, cacheline = 1;
562 639
640 b->last_set_unwritten = 0;
641
563 bset_alloc_tree(b, t); 642 bset_alloc_tree(b, t);
564 643
565 t->size = min_t(unsigned, 644 t->size = min_t(unsigned,
566 bkey_to_cacheline(t, end(t->data)), 645 bkey_to_cacheline(t, bset_bkey_last(t->data)),
567 b->sets->tree + bset_tree_space(b) - t->tree); 646 b->set->tree + btree_keys_cachelines(b) - t->tree);
568 647
569 if (t->size < 2) { 648 if (t->size < 2) {
570 t->size = 0; 649 t->size = 0;
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
577 for (j = inorder_next(0, t->size); 656 for (j = inorder_next(0, t->size);
578 j; 657 j;
579 j = inorder_next(j, t->size)) { 658 j = inorder_next(j, t->size)) {
580 while (bkey_to_cacheline(t, k) != cacheline) 659 while (bkey_to_cacheline(t, k) < cacheline)
581 k = bkey_next(k); 660 prev = k, k = bkey_next(k);
582 661
583 t->prev[j] = bkey_u64s(k); 662 t->prev[j] = bkey_u64s(prev);
584 k = bkey_next(k); 663 t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
585 cacheline++;
586 t->tree[j].m = bkey_to_cacheline_offset(k);
587 } 664 }
588 665
589 while (bkey_next(k) != end(t->data)) 666 while (bkey_next(k) != bset_bkey_last(t->data))
590 k = bkey_next(k); 667 k = bkey_next(k);
591 668
592 t->end = *k; 669 t->end = *k;
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
597 j = inorder_next(j, t->size)) 674 j = inorder_next(j, t->size))
598 make_bfloat(t, j); 675 make_bfloat(t, j);
599} 676}
677EXPORT_SYMBOL(bch_bset_build_written_tree);
600 678
601void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) 679/* Insert */
680
681void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
602{ 682{
603 struct bset_tree *t; 683 struct bset_tree *t;
604 unsigned inorder, j = 1; 684 unsigned inorder, j = 1;
605 685
606 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 686 for (t = b->set; t <= bset_tree_last(b); t++)
607 if (k < end(t->data)) 687 if (k < bset_bkey_last(t->data))
608 goto found_set; 688 goto found_set;
609 689
610 BUG(); 690 BUG();
@@ -617,7 +697,7 @@ found_set:
617 if (k == t->data->start) 697 if (k == t->data->start)
618 goto fix_left; 698 goto fix_left;
619 699
620 if (bkey_next(k) == end(t->data)) { 700 if (bkey_next(k) == bset_bkey_last(t->data)) {
621 t->end = *k; 701 t->end = *k;
622 goto fix_right; 702 goto fix_right;
623 } 703 }
@@ -642,10 +722,12 @@ fix_right: do {
642 j = j * 2 + 1; 722 j = j * 2 + 1;
643 } while (j < t->size); 723 } while (j < t->size);
644} 724}
725EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
645 726
646void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) 727static void bch_bset_fix_lookup_table(struct btree_keys *b,
728 struct bset_tree *t,
729 struct bkey *k)
647{ 730{
648 struct bset_tree *t = &b->sets[b->nsets];
649 unsigned shift = bkey_u64s(k); 731 unsigned shift = bkey_u64s(k);
650 unsigned j = bkey_to_cacheline(t, k); 732 unsigned j = bkey_to_cacheline(t, k);
651 733
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
657 * lookup table for the first key that is strictly greater than k: 739 * lookup table for the first key that is strictly greater than k:
658 * it's either k's cacheline or the next one 740 * it's either k's cacheline or the next one
659 */ 741 */
660 if (j < t->size && 742 while (j < t->size &&
661 table_to_bkey(t, j) <= k) 743 table_to_bkey(t, j) <= k)
662 j++; 744 j++;
663 745
664 /* Adjust all the lookup table entries, and find a new key for any that 746 /* Adjust all the lookup table entries, and find a new key for any that
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
673 while (k < cacheline_to_bkey(t, j, 0)) 755 while (k < cacheline_to_bkey(t, j, 0))
674 k = bkey_next(k); 756 k = bkey_next(k);
675 757
676 t->prev[j] = bkey_to_cacheline_offset(k); 758 t->prev[j] = bkey_to_cacheline_offset(t, j, k);
677 } 759 }
678 } 760 }
679 761
680 if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) 762 if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
681 return; 763 return;
682 764
683 /* Possibly add a new entry to the end of the lookup table */ 765 /* Possibly add a new entry to the end of the lookup table */
684 766
685 for (k = table_to_bkey(t, t->size - 1); 767 for (k = table_to_bkey(t, t->size - 1);
686 k != end(t->data); 768 k != bset_bkey_last(t->data);
687 k = bkey_next(k)) 769 k = bkey_next(k))
688 if (t->size == bkey_to_cacheline(t, k)) { 770 if (t->size == bkey_to_cacheline(t, k)) {
689 t->prev[t->size] = bkey_to_cacheline_offset(k); 771 t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
690 t->size++; 772 t->size++;
691 } 773 }
692} 774}
693 775
694void bch_bset_init_next(struct btree *b) 776/*
777 * Tries to merge l and r: l should be lower than r
778 * Returns true if we were able to merge. If we did merge, l will be the merged
779 * key, r will be untouched.
780 */
781bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
695{ 782{
696 struct bset *i = write_block(b); 783 if (!b->ops->key_merge)
784 return false;
697 785
698 if (i != b->sets[0].data) { 786 /*
699 b->sets[++b->nsets].data = i; 787 * Generic header checks
700 i->seq = b->sets[0].data->seq; 788 * Assumes left and right are in order
701 } else 789 * Left and right must be exactly aligned
702 get_random_bytes(&i->seq, sizeof(uint64_t)); 790 */
791 if (!bch_bkey_equal_header(l, r) ||
792 bkey_cmp(l, &START_KEY(r)))
793 return false;
703 794
704 i->magic = bset_magic(&b->c->sb); 795 return b->ops->key_merge(b, l, r);
705 i->version = 0; 796}
706 i->keys = 0; 797EXPORT_SYMBOL(bch_bkey_try_merge);
798
799void bch_bset_insert(struct btree_keys *b, struct bkey *where,
800 struct bkey *insert)
801{
802 struct bset_tree *t = bset_tree_last(b);
803
804 BUG_ON(!b->last_set_unwritten);
805 BUG_ON(bset_byte_offset(b, t->data) +
806 __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
807 PAGE_SIZE << b->page_order);
808
809 memmove((uint64_t *) where + bkey_u64s(insert),
810 where,
811 (void *) bset_bkey_last(t->data) - (void *) where);
812
813 t->data->keys += bkey_u64s(insert);
814 bkey_copy(where, insert);
815 bch_bset_fix_lookup_table(b, t, where);
816}
817EXPORT_SYMBOL(bch_bset_insert);
818
819unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
820 struct bkey *replace_key)
821{
822 unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
823 struct bset *i = bset_tree_last(b)->data;
824 struct bkey *m, *prev = NULL;
825 struct btree_iter iter;
826
827 BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
828
829 m = bch_btree_iter_init(b, &iter, b->ops->is_extents
830 ? PRECEDING_KEY(&START_KEY(k))
831 : PRECEDING_KEY(k));
832
833 if (b->ops->insert_fixup(b, k, &iter, replace_key))
834 return status;
707 835
708 bset_build_unwritten_tree(b); 836 status = BTREE_INSERT_STATUS_INSERT;
837
838 while (m != bset_bkey_last(i) &&
839 bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
840 prev = m, m = bkey_next(m);
841
842 /* prev is in the tree, if we merge we're done */
843 status = BTREE_INSERT_STATUS_BACK_MERGE;
844 if (prev &&
845 bch_bkey_try_merge(b, prev, k))
846 goto merged;
847#if 0
848 status = BTREE_INSERT_STATUS_OVERWROTE;
849 if (m != bset_bkey_last(i) &&
850 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
851 goto copy;
852#endif
853 status = BTREE_INSERT_STATUS_FRONT_MERGE;
854 if (m != bset_bkey_last(i) &&
855 bch_bkey_try_merge(b, k, m))
856 goto copy;
857
858 bch_bset_insert(b, m, k);
859copy: bkey_copy(m, k);
860merged:
861 return status;
709} 862}
863EXPORT_SYMBOL(bch_btree_insert_key);
864
865/* Lookup */
710 866
711struct bset_search_iter { 867struct bset_search_iter {
712 struct bkey *l, *r; 868 struct bkey *l, *r;
713}; 869};
714 870
715static struct bset_search_iter bset_search_write_set(struct btree *b, 871static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
716 struct bset_tree *t,
717 const struct bkey *search) 872 const struct bkey *search)
718{ 873{
719 unsigned li = 0, ri = t->size; 874 unsigned li = 0, ri = t->size;
720 875
721 BUG_ON(!b->nsets &&
722 t->size < bkey_to_cacheline(t, end(t->data)));
723
724 while (li + 1 != ri) { 876 while (li + 1 != ri) {
725 unsigned m = (li + ri) >> 1; 877 unsigned m = (li + ri) >> 1;
726 878
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
732 884
733 return (struct bset_search_iter) { 885 return (struct bset_search_iter) {
734 table_to_bkey(t, li), 886 table_to_bkey(t, li),
735 ri < t->size ? table_to_bkey(t, ri) : end(t->data) 887 ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
736 }; 888 };
737} 889}
738 890
739static struct bset_search_iter bset_search_tree(struct btree *b, 891static struct bset_search_iter bset_search_tree(struct bset_tree *t,
740 struct bset_tree *t,
741 const struct bkey *search) 892 const struct bkey *search)
742{ 893{
743 struct bkey *l, *r; 894 struct bkey *l, *r;
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
784 f = &t->tree[inorder_next(j, t->size)]; 935 f = &t->tree[inorder_next(j, t->size)];
785 r = cacheline_to_bkey(t, inorder, f->m); 936 r = cacheline_to_bkey(t, inorder, f->m);
786 } else 937 } else
787 r = end(t->data); 938 r = bset_bkey_last(t->data);
788 } else { 939 } else {
789 r = cacheline_to_bkey(t, inorder, f->m); 940 r = cacheline_to_bkey(t, inorder, f->m);
790 941
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
798 return (struct bset_search_iter) {l, r}; 949 return (struct bset_search_iter) {l, r};
799} 950}
800 951
801struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, 952struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
802 const struct bkey *search) 953 const struct bkey *search)
803{ 954{
804 struct bset_search_iter i; 955 struct bset_search_iter i;
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
820 971
821 if (unlikely(!t->size)) { 972 if (unlikely(!t->size)) {
822 i.l = t->data->start; 973 i.l = t->data->start;
823 i.r = end(t->data); 974 i.r = bset_bkey_last(t->data);
824 } else if (bset_written(b, t)) { 975 } else if (bset_written(b, t)) {
825 /* 976 /*
826 * Each node in the auxiliary search tree covers a certain range 977 * Each node in the auxiliary search tree covers a certain range
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
830 */ 981 */
831 982
832 if (unlikely(bkey_cmp(search, &t->end) >= 0)) 983 if (unlikely(bkey_cmp(search, &t->end) >= 0))
833 return end(t->data); 984 return bset_bkey_last(t->data);
834 985
835 if (unlikely(bkey_cmp(search, t->data->start) < 0)) 986 if (unlikely(bkey_cmp(search, t->data->start) < 0))
836 return t->data->start; 987 return t->data->start;
837 988
838 i = bset_search_tree(b, t, search); 989 i = bset_search_tree(t, search);
839 } else 990 } else {
840 i = bset_search_write_set(b, t, search); 991 BUG_ON(!b->nsets &&
992 t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
841 993
842 if (expensive_debug_checks(b->c)) { 994 i = bset_search_write_set(t, search);
995 }
996
997 if (btree_keys_expensive_checks(b)) {
843 BUG_ON(bset_written(b, t) && 998 BUG_ON(bset_written(b, t) &&
844 i.l != t->data->start && 999 i.l != t->data->start &&
845 bkey_cmp(tree_to_prev_bkey(t, 1000 bkey_cmp(tree_to_prev_bkey(t,
846 inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 1001 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
847 search) > 0); 1002 search) > 0);
848 1003
849 BUG_ON(i.r != end(t->data) && 1004 BUG_ON(i.r != bset_bkey_last(t->data) &&
850 bkey_cmp(i.r, search) <= 0); 1005 bkey_cmp(i.r, search) <= 0);
851 } 1006 }
852 1007
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
856 1011
857 return i.l; 1012 return i.l;
858} 1013}
1014EXPORT_SYMBOL(__bch_bset_search);
859 1015
860/* Btree iterator */ 1016/* Btree iterator */
861 1017
862/* 1018typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
863 * Returns true if l > r - unless l == r, in which case returns true if l is 1019 struct btree_iter_set);
864 * older than r. 1020
865 *
866 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
867 * equal in different sets, we have to process them newest to oldest.
868 */
869static inline bool btree_iter_cmp(struct btree_iter_set l, 1021static inline bool btree_iter_cmp(struct btree_iter_set l,
870 struct btree_iter_set r) 1022 struct btree_iter_set r)
871{ 1023{
872 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 1024 return bkey_cmp(l.k, r.k) > 0;
873
874 return c ? c > 0 : l.k < r.k;
875} 1025}
876 1026
877static inline bool btree_iter_end(struct btree_iter *iter) 1027static inline bool btree_iter_end(struct btree_iter *iter)
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
888 btree_iter_cmp)); 1038 btree_iter_cmp));
889} 1039}
890 1040
891struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 1041static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
892 struct bkey *search, struct bset_tree *start) 1042 struct btree_iter *iter,
1043 struct bkey *search,
1044 struct bset_tree *start)
893{ 1045{
894 struct bkey *ret = NULL; 1046 struct bkey *ret = NULL;
895 iter->size = ARRAY_SIZE(iter->data); 1047 iter->size = ARRAY_SIZE(iter->data);
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
899 iter->b = b; 1051 iter->b = b;
900#endif 1052#endif
901 1053
902 for (; start <= &b->sets[b->nsets]; start++) { 1054 for (; start <= bset_tree_last(b); start++) {
903 ret = bch_bset_search(b, start, search); 1055 ret = bch_bset_search(b, start, search);
904 bch_btree_iter_push(iter, ret, end(start->data)); 1056 bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
905 } 1057 }
906 1058
907 return ret; 1059 return ret;
908} 1060}
909 1061
910struct bkey *bch_btree_iter_next(struct btree_iter *iter) 1062struct bkey *bch_btree_iter_init(struct btree_keys *b,
1063 struct btree_iter *iter,
1064 struct bkey *search)
1065{
1066 return __bch_btree_iter_init(b, iter, search, b->set);
1067}
1068EXPORT_SYMBOL(bch_btree_iter_init);
1069
1070static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
1071 btree_iter_cmp_fn *cmp)
911{ 1072{
912 struct btree_iter_set unused; 1073 struct btree_iter_set unused;
913 struct bkey *ret = NULL; 1074 struct bkey *ret = NULL;
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
924 } 1085 }
925 1086
926 if (iter->data->k == iter->data->end) 1087 if (iter->data->k == iter->data->end)
927 heap_pop(iter, unused, btree_iter_cmp); 1088 heap_pop(iter, unused, cmp);
928 else 1089 else
929 heap_sift(iter, 0, btree_iter_cmp); 1090 heap_sift(iter, 0, cmp);
930 } 1091 }
931 1092
932 return ret; 1093 return ret;
933} 1094}
934 1095
1096struct bkey *bch_btree_iter_next(struct btree_iter *iter)
1097{
1098 return __bch_btree_iter_next(iter, btree_iter_cmp);
1099
1100}
1101EXPORT_SYMBOL(bch_btree_iter_next);
1102
935struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, 1103struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
936 struct btree *b, ptr_filter_fn fn) 1104 struct btree_keys *b, ptr_filter_fn fn)
937{ 1105{
938 struct bkey *ret; 1106 struct bkey *ret;
939 1107
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
946 1114
947/* Mergesort */ 1115/* Mergesort */
948 1116
949static void sort_key_next(struct btree_iter *iter, 1117void bch_bset_sort_state_free(struct bset_sort_state *state)
950 struct btree_iter_set *i)
951{ 1118{
952 i->k = bkey_next(i->k); 1119 if (state->pool)
953 1120 mempool_destroy(state->pool);
954 if (i->k == i->end)
955 *i = iter->data[--iter->used];
956} 1121}
957 1122
958static void btree_sort_fixup(struct btree_iter *iter) 1123int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
959{ 1124{
960 while (iter->used > 1) { 1125 spin_lock_init(&state->time.lock);
961 struct btree_iter_set *top = iter->data, *i = top + 1;
962
963 if (iter->used > 2 &&
964 btree_iter_cmp(i[0], i[1]))
965 i++;
966
967 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
968 break;
969 1126
970 if (!KEY_SIZE(i->k)) { 1127 state->page_order = page_order;
971 sort_key_next(iter, i); 1128 state->crit_factor = int_sqrt(1 << page_order);
972 heap_sift(iter, i - top, btree_iter_cmp);
973 continue;
974 }
975 1129
976 if (top->k > i->k) { 1130 state->pool = mempool_create_page_pool(1, page_order);
977 if (bkey_cmp(top->k, i->k) >= 0) 1131 if (!state->pool)
978 sort_key_next(iter, i); 1132 return -ENOMEM;
979 else
980 bch_cut_front(top->k, i->k);
981 1133
982 heap_sift(iter, i - top, btree_iter_cmp); 1134 return 0;
983 } else {
984 /* can't happen because of comparison func */
985 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
986 bch_cut_back(&START_KEY(i->k), top->k);
987 }
988 }
989} 1135}
1136EXPORT_SYMBOL(bch_bset_sort_state_init);
990 1137
991static void btree_mergesort(struct btree *b, struct bset *out, 1138static void btree_mergesort(struct btree_keys *b, struct bset *out,
992 struct btree_iter *iter, 1139 struct btree_iter *iter,
993 bool fixup, bool remove_stale) 1140 bool fixup, bool remove_stale)
994{ 1141{
1142 int i;
995 struct bkey *k, *last = NULL; 1143 struct bkey *k, *last = NULL;
996 bool (*bad)(struct btree *, const struct bkey *) = remove_stale 1144 BKEY_PADDED(k) tmp;
1145 bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
997 ? bch_ptr_bad 1146 ? bch_ptr_bad
998 : bch_ptr_invalid; 1147 : bch_ptr_invalid;
999 1148
1149 /* Heapify the iterator, using our comparison function */
1150 for (i = iter->used / 2 - 1; i >= 0; --i)
1151 heap_sift(iter, i, b->ops->sort_cmp);
1152
1000 while (!btree_iter_end(iter)) { 1153 while (!btree_iter_end(iter)) {
1001 if (fixup && !b->level) 1154 if (b->ops->sort_fixup && fixup)
1002 btree_sort_fixup(iter); 1155 k = b->ops->sort_fixup(iter, &tmp.k);
1156 else
1157 k = NULL;
1158
1159 if (!k)
1160 k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
1003 1161
1004 k = bch_btree_iter_next(iter);
1005 if (bad(b, k)) 1162 if (bad(b, k))
1006 continue; 1163 continue;
1007 1164
1008 if (!last) { 1165 if (!last) {
1009 last = out->start; 1166 last = out->start;
1010 bkey_copy(last, k); 1167 bkey_copy(last, k);
1011 } else if (b->level || 1168 } else if (!bch_bkey_try_merge(b, last, k)) {
1012 !bch_bkey_try_merge(b, last, k)) {
1013 last = bkey_next(last); 1169 last = bkey_next(last);
1014 bkey_copy(last, k); 1170 bkey_copy(last, k);
1015 } 1171 }
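
bch_bset_sort_state_init() above preallocates a one-element page pool that __btree_sort() (in the next hunk) falls back to when an opportunistic __get_free_pages() call fails, replacing the old cache-set-wide sort buffer and sort_lock. Illustration only: a userspace analogue of that shape, with a mutex standing in for the mempool's blocking behaviour; reserve_pool and its functions are invented for the sketch and are not the kernel mempool API.

#include <stdlib.h>
#include <pthread.h>

struct reserve_pool {
	void *reserve;			/* preallocated at init, kept for emergencies */
	size_t reserve_size;
	pthread_mutex_t lock;		/* one user of the reserve at a time */
};

static int pool_init(struct reserve_pool *p, size_t reserve_size)
{
	p->reserve = malloc(reserve_size);
	p->reserve_size = reserve_size;
	pthread_mutex_init(&p->lock, NULL);
	return p->reserve ? 0 : -1;
}

/* Try a fresh allocation first; fall back to the guaranteed reserve */
static void *pool_alloc(struct reserve_pool *p, size_t size, int *used_reserve)
{
	void *buf = size <= p->reserve_size ? malloc(size) : NULL;

	if (buf) {
		*used_reserve = 0;
		return buf;
	}

	/* May block until the previous reserve user calls pool_free() */
	pthread_mutex_lock(&p->lock);
	*used_reserve = 1;
	return p->reserve;
}

static void pool_free(struct reserve_pool *p, void *buf, int used_reserve)
{
	if (used_reserve)
		pthread_mutex_unlock(&p->lock);
	else
		free(buf);
}

int main(void)
{
	struct reserve_pool p;
	int used_reserve;
	void *buf;

	if (pool_init(&p, 1 << 16))
		return 1;

	buf = pool_alloc(&p, 1 << 16, &used_reserve);
	/* ... sort into buf ... */
	pool_free(&p, buf, used_reserve);
	return 0;
}

The kernel version guarantees forward progress the same way: the mempool always holds one buffer of the maximum sort order (note the BUG_ON(order > state->page_order)), so a sort can never fail for lack of memory.
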
@@ -1020,27 +1176,30 @@ static void btree_mergesort(struct btree *b, struct bset *out,
1020 pr_debug("sorted %i keys", out->keys); 1176 pr_debug("sorted %i keys", out->keys);
1021} 1177}
1022 1178
1023static void __btree_sort(struct btree *b, struct btree_iter *iter, 1179static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
1024 unsigned start, unsigned order, bool fixup) 1180 unsigned start, unsigned order, bool fixup,
1181 struct bset_sort_state *state)
1025{ 1182{
1026 uint64_t start_time; 1183 uint64_t start_time;
1027 bool remove_stale = !b->written; 1184 bool used_mempool = false;
1028 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
1029 order); 1186 order);
1030 if (!out) { 1187 if (!out) {
1031 mutex_lock(&b->c->sort_lock); 1188 struct page *outp;
1032 out = b->c->sort; 1189
1033 order = ilog2(bucket_pages(b->c)); 1190 BUG_ON(order > state->page_order);
1191
1192 outp = mempool_alloc(state->pool, GFP_NOIO);
1193 out = page_address(outp);
1194 used_mempool = true;
1195 order = state->page_order;
1034 } 1196 }
1035 1197
1036 start_time = local_clock(); 1198 start_time = local_clock();
1037 1199
1038 btree_mergesort(b, out, iter, fixup, remove_stale); 1200 btree_mergesort(b, out, iter, fixup, false);
1039 b->nsets = start; 1201 b->nsets = start;
1040 1202
1041 if (!fixup && !start && b->written)
1042 bch_btree_verify(b, out);
1043
1044 if (!start && order == b->page_order) { 1203 if (!start && order == b->page_order) {
1045 /* 1204 /*
1046 * Our temporary buffer is the same size as the btree node's 1205 * Our temporary buffer is the same size as the btree node's
@@ -1048,84 +1207,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1048 * memcpy() 1207 * memcpy()
1049 */ 1208 */
1050 1209
1051 out->magic = bset_magic(&b->c->sb); 1210 out->magic = b->set->data->magic;
1052 out->seq = b->sets[0].data->seq; 1211 out->seq = b->set->data->seq;
1053 out->version = b->sets[0].data->version; 1212 out->version = b->set->data->version;
1054 swap(out, b->sets[0].data); 1213 swap(out, b->set->data);
1055
1056 if (b->c->sort == b->sets[0].data)
1057 b->c->sort = out;
1058 } else { 1214 } else {
1059 b->sets[start].data->keys = out->keys; 1215 b->set[start].data->keys = out->keys;
1060 memcpy(b->sets[start].data->start, out->start, 1216 memcpy(b->set[start].data->start, out->start,
1061 (void *) end(out) - (void *) out->start); 1217 (void *) bset_bkey_last(out) - (void *) out->start);
1062 } 1218 }
1063 1219
1064 if (out == b->c->sort) 1220 if (used_mempool)
1065 mutex_unlock(&b->c->sort_lock); 1221 mempool_free(virt_to_page(out), state->pool);
1066 else 1222 else
1067 free_pages((unsigned long) out, order); 1223 free_pages((unsigned long) out, order);
1068 1224
1069 if (b->written) 1225 bch_bset_build_written_tree(b);
1070 bset_build_written_tree(b);
1071 1226
1072 if (!start) 1227 if (!start)
1073 bch_time_stats_update(&b->c->sort_time, start_time); 1228 bch_time_stats_update(&state->time, start_time);
1074} 1229}
1075 1230
1076void bch_btree_sort_partial(struct btree *b, unsigned start) 1231void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
1232 struct bset_sort_state *state)
1077{ 1233{
1078 size_t order = b->page_order, keys = 0; 1234 size_t order = b->page_order, keys = 0;
1079 struct btree_iter iter; 1235 struct btree_iter iter;
1080 int oldsize = bch_count_data(b); 1236 int oldsize = bch_count_data(b);
1081 1237
1082 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1238 __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
1083
1084 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1085 (b->sets[b->nsets].size || b->nsets));
1086
1087 1239
1088 if (start) { 1240 if (start) {
1089 unsigned i; 1241 unsigned i;
1090 1242
1091 for (i = start; i <= b->nsets; i++) 1243 for (i = start; i <= b->nsets; i++)
1092 keys += b->sets[i].data->keys; 1244 keys += b->set[i].data->keys;
1093 1245
1094 order = roundup_pow_of_two(__set_bytes(b->sets->data, 1246 order = get_order(__set_bytes(b->set->data, keys));
1095 keys)) / PAGE_SIZE;
1096 if (order)
1097 order = ilog2(order);
1098 } 1247 }
1099 1248
1100 __btree_sort(b, &iter, start, order, false); 1249 __btree_sort(b, &iter, start, order, false, state);
1101 1250
1102 EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); 1251 EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
1103} 1252}
1253EXPORT_SYMBOL(bch_btree_sort_partial);
1104 1254
1105void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1255void bch_btree_sort_and_fix_extents(struct btree_keys *b,
1256 struct btree_iter *iter,
1257 struct bset_sort_state *state)
1106{ 1258{
1107 BUG_ON(!b->written); 1259 __btree_sort(b, iter, 0, b->page_order, true, state);
1108 __btree_sort(b, iter, 0, b->page_order, true);
1109} 1260}
1110 1261
1111void bch_btree_sort_into(struct btree *b, struct btree *new) 1262void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
1263 struct bset_sort_state *state)
1112{ 1264{
1113 uint64_t start_time = local_clock(); 1265 uint64_t start_time = local_clock();
1114 1266
1115 struct btree_iter iter; 1267 struct btree_iter iter;
1116 bch_btree_iter_init(b, &iter, NULL); 1268 bch_btree_iter_init(b, &iter, NULL);
1117 1269
1118 btree_mergesort(b, new->sets->data, &iter, false, true); 1270 btree_mergesort(b, new->set->data, &iter, false, true);
1119 1271
1120 bch_time_stats_update(&b->c->sort_time, start_time); 1272 bch_time_stats_update(&state->time, start_time);
1121 1273
1122 bkey_copy_key(&new->key, &b->key); 1274 new->set->size = 0; // XXX: why?
1123 new->sets->size = 0;
1124} 1275}
1125 1276
1126#define SORT_CRIT (4096 / sizeof(uint64_t)) 1277#define SORT_CRIT (4096 / sizeof(uint64_t))
1127 1278
1128void bch_btree_sort_lazy(struct btree *b) 1279void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
1129{ 1280{
1130 unsigned crit = SORT_CRIT; 1281 unsigned crit = SORT_CRIT;
1131 int i; 1282 int i;
@@ -1134,50 +1285,32 @@ void bch_btree_sort_lazy(struct btree *b)
1134 if (!b->nsets) 1285 if (!b->nsets)
1135 goto out; 1286 goto out;
1136 1287
1137 /* If not a leaf node, always sort */
1138 if (b->level) {
1139 bch_btree_sort(b);
1140 return;
1141 }
1142
1143 for (i = b->nsets - 1; i >= 0; --i) { 1288 for (i = b->nsets - 1; i >= 0; --i) {
1144 crit *= b->c->sort_crit_factor; 1289 crit *= state->crit_factor;
1145 1290
1146 if (b->sets[i].data->keys < crit) { 1291 if (b->set[i].data->keys < crit) {
1147 bch_btree_sort_partial(b, i); 1292 bch_btree_sort_partial(b, i, state);
1148 return; 1293 return;
1149 } 1294 }
1150 } 1295 }
1151 1296
1152 /* Sort if we'd overflow */ 1297 /* Sort if we'd overflow */
1153 if (b->nsets + 1 == MAX_BSETS) { 1298 if (b->nsets + 1 == MAX_BSETS) {
1154 bch_btree_sort(b); 1299 bch_btree_sort(b, state);
1155 return; 1300 return;
1156 } 1301 }
1157 1302
1158out: 1303out:
1159 bset_build_written_tree(b); 1304 bch_bset_build_written_tree(b);
1160} 1305}
1306EXPORT_SYMBOL(bch_btree_sort_lazy);
1161 1307
1162/* Sysfs stuff */ 1308void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
1163
1164struct bset_stats {
1165 struct btree_op op;
1166 size_t nodes;
1167 size_t sets_written, sets_unwritten;
1168 size_t bytes_written, bytes_unwritten;
1169 size_t floats, failed;
1170};
1171
1172static int btree_bset_stats(struct btree_op *op, struct btree *b)
1173{ 1309{
1174 struct bset_stats *stats = container_of(op, struct bset_stats, op);
1175 unsigned i; 1310 unsigned i;
1176 1311
1177 stats->nodes++;
1178
1179 for (i = 0; i <= b->nsets; i++) { 1312 for (i = 0; i <= b->nsets; i++) {
1180 struct bset_tree *t = &b->sets[i]; 1313 struct bset_tree *t = &b->set[i];
1181 size_t bytes = t->data->keys * sizeof(uint64_t); 1314 size_t bytes = t->data->keys * sizeof(uint64_t);
1182 size_t j; 1315 size_t j;
1183 1316
@@ -1195,32 +1328,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
1195 stats->bytes_unwritten += bytes; 1328 stats->bytes_unwritten += bytes;
1196 } 1329 }
1197 } 1330 }
1198
1199 return MAP_CONTINUE;
1200}
1201
1202int bch_bset_print_stats(struct cache_set *c, char *buf)
1203{
1204 struct bset_stats t;
1205 int ret;
1206
1207 memset(&t, 0, sizeof(struct bset_stats));
1208 bch_btree_op_init(&t.op, -1);
1209
1210 ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1211 if (ret < 0)
1212 return ret;
1213
1214 return snprintf(buf, PAGE_SIZE,
1215 "btree nodes: %zu\n"
1216 "written sets: %zu\n"
1217 "unwritten sets: %zu\n"
1218 "written key bytes: %zu\n"
1219 "unwritten key bytes: %zu\n"
1220 "floats: %zu\n"
1221 "failed: %zu\n",
1222 t.nodes,
1223 t.sets_written, t.sets_unwritten,
1224 t.bytes_written, t.bytes_unwritten,
1225 t.floats, t.failed);
1226} 1331}
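
The bch_btree_sort_lazy() hunk above keeps up to MAX_BSETS sorted sets per node and only merges from the point where a newer set has fallen behind a geometrically growing threshold (multiplied by crit_factor per level), doing a full sort only when the node is about to run out of set slots. A minimal sketch of that decision, illustration only, with invented names and figures:

#include <stdint.h>
#include <stdio.h>

#define MAX_BSETS	4
#define SORT_CRIT	(4096 / sizeof(uint64_t))

/*
 * set_keys[i] is the number of keys in sorted set i; set 0 is the big,
 * already-written set.  Returns the set index to merge from, or -1 to
 * leave the node alone.
 */
static int lazy_sort_from(const unsigned *set_keys, int nsets, unsigned crit_factor)
{
	unsigned long crit = SORT_CRIT;
	int i;

	if (!nsets)
		return -1;

	for (i = nsets - 1; i >= 0; --i) {
		crit *= crit_factor;
		if (set_keys[i] < crit)
			return i;		/* partial sort: merge sets i..nsets */
	}

	if (nsets + 1 == MAX_BSETS)
		return 0;			/* full sort before the slots overflow */

	return -1;
}

int main(void)
{
	unsigned sets[MAX_BSETS] = { 5000, 700, 90 };

	/* nsets counts the sets beyond set 0, as in struct btree_keys */
	printf("merge from set %d\n", lazy_sort_from(sets, 2, 3));
	return 0;
}

bch_btree_node_write() (later in this diff) bypasses the heuristic for internal nodes and always sorts them fully once they have more than one set.
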
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1d3c24f9fa0e..003260f4ddf6 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,7 +1,11 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h> 4#include <linux/bcache.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7
8#include "util.h" /* for time_stats */
5 9
6/* 10/*
7 * BKEYS: 11 * BKEYS:
@@ -142,20 +146,13 @@
142 * first key in that range of bytes again. 146 * first key in that range of bytes again.
143 */ 147 */
144 148
145/* Btree key comparison/iteration */ 149struct btree_keys;
150struct btree_iter;
151struct btree_iter_set;
152struct bkey_float;
146 153
147#define MAX_BSETS 4U 154#define MAX_BSETS 4U
148 155
149struct btree_iter {
150 size_t size, used;
151#ifdef CONFIG_BCACHE_DEBUG
152 struct btree *b;
153#endif
154 struct btree_iter_set {
155 struct bkey *k, *end;
156 } data[MAX_BSETS];
157};
158
159struct bset_tree { 156struct bset_tree {
160 /* 157 /*
161 * We construct a binary tree in an array as if the array 158 * We construct a binary tree in an array as if the array
@@ -165,14 +162,14 @@ struct bset_tree {
165 */ 162 */
166 163
167 /* size of the binary tree and prev array */ 164 /* size of the binary tree and prev array */
168 unsigned size; 165 unsigned size;
169 166
170 /* function of size - precalculated for to_inorder() */ 167 /* function of size - precalculated for to_inorder() */
171 unsigned extra; 168 unsigned extra;
172 169
173 /* copy of the last key in the set */ 170 /* copy of the last key in the set */
174 struct bkey end; 171 struct bkey end;
175 struct bkey_float *tree; 172 struct bkey_float *tree;
176 173
177 /* 174 /*
178 * The nodes in the bset tree point to specific keys - this 175 * The nodes in the bset tree point to specific keys - this
@@ -182,12 +179,219 @@ struct bset_tree {
182 * to keep bkey_float to 4 bytes and prev isn't used in the fast 179 * to keep bkey_float to 4 bytes and prev isn't used in the fast
183 * path. 180 * path.
184 */ 181 */
185 uint8_t *prev; 182 uint8_t *prev;
186 183
187 /* The actual btree node, with pointers to each sorted set */ 184 /* The actual btree node, with pointers to each sorted set */
188 struct bset *data; 185 struct bset *data;
186};
187
188struct btree_keys_ops {
189 bool (*sort_cmp)(struct btree_iter_set,
190 struct btree_iter_set);
191 struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
192 bool (*insert_fixup)(struct btree_keys *, struct bkey *,
193 struct btree_iter *, struct bkey *);
194 bool (*key_invalid)(struct btree_keys *,
195 const struct bkey *);
196 bool (*key_bad)(struct btree_keys *, const struct bkey *);
197 bool (*key_merge)(struct btree_keys *,
198 struct bkey *, struct bkey *);
199 void (*key_to_text)(char *, size_t, const struct bkey *);
200 void (*key_dump)(struct btree_keys *, const struct bkey *);
201
202 /*
203 * Only used for deciding whether to use START_KEY(k) or just the key
204 * itself in a couple places
205 */
206 bool is_extents;
207};
208
209struct btree_keys {
210 const struct btree_keys_ops *ops;
211 uint8_t page_order;
212 uint8_t nsets;
213 unsigned last_set_unwritten:1;
214 bool *expensive_debug_checks;
215
216 /*
217 * Sets of sorted keys - the real btree node - plus a binary search tree
218 *
219 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
220 * to the memory we have allocated for this btree node. Additionally,
221 * set[0]->data points to the entire btree node as it exists on disk.
222 */
223 struct bset_tree set[MAX_BSETS];
224};
225
226static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
227{
228 return b->set + b->nsets;
229}
230
231static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
232{
233 return t <= b->set + b->nsets - b->last_set_unwritten;
234}
235
236static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
237{
238 return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
239}
240
241static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
242{
243 return ((size_t) i) - ((size_t) b->set->data);
244}
245
246static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
247{
248 return bset_byte_offset(b, i) >> 9;
249}
250
251#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
252#define set_bytes(i) __set_bytes(i, i->keys)
253
254#define __set_blocks(i, k, block_bytes) \
255 DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
256#define set_blocks(i, block_bytes) \
257 __set_blocks(i, (i)->keys, block_bytes)
258
259static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
260{
261 struct bset_tree *t = bset_tree_last(b);
262
263 BUG_ON((PAGE_SIZE << b->page_order) <
264 (bset_byte_offset(b, t->data) + set_bytes(t->data)));
265
266 if (!b->last_set_unwritten)
267 return 0;
268
269 return ((PAGE_SIZE << b->page_order) -
270 (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
271 sizeof(u64);
272}
273
274static inline struct bset *bset_next_set(struct btree_keys *b,
275 unsigned block_bytes)
276{
277 struct bset *i = bset_tree_last(b)->data;
278
279 return ((void *) i) + roundup(set_bytes(i), block_bytes);
280}
281
282void bch_btree_keys_free(struct btree_keys *);
283int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
284void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
285 bool *);
286
287void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
288void bch_bset_build_written_tree(struct btree_keys *);
289void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
290bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
291void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
292unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
293 struct bkey *);
294
295enum {
296 BTREE_INSERT_STATUS_NO_INSERT = 0,
297 BTREE_INSERT_STATUS_INSERT,
298 BTREE_INSERT_STATUS_BACK_MERGE,
299 BTREE_INSERT_STATUS_OVERWROTE,
300 BTREE_INSERT_STATUS_FRONT_MERGE,
189}; 301};
190 302
303/* Btree key iteration */
304
305struct btree_iter {
306 size_t size, used;
307#ifdef CONFIG_BCACHE_DEBUG
308 struct btree_keys *b;
309#endif
310 struct btree_iter_set {
311 struct bkey *k, *end;
312 } data[MAX_BSETS];
313};
314
315typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
316
317struct bkey *bch_btree_iter_next(struct btree_iter *);
318struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
319 struct btree_keys *, ptr_filter_fn);
320
321void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
322struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
323 struct bkey *);
324
325struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
326 const struct bkey *);
327
328/*
329 * Returns the first key that is strictly greater than search
330 */
331static inline struct bkey *bch_bset_search(struct btree_keys *b,
332 struct bset_tree *t,
333 const struct bkey *search)
334{
335 return search ? __bch_bset_search(b, t, search) : t->data->start;
336}
337
338#define for_each_key_filter(b, k, iter, filter) \
339 for (bch_btree_iter_init((b), (iter), NULL); \
340 ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
341
342#define for_each_key(b, k, iter) \
343 for (bch_btree_iter_init((b), (iter), NULL); \
344 ((k) = bch_btree_iter_next(iter));)
345
346/* Sorting */
347
348struct bset_sort_state {
349 mempool_t *pool;
350
351 unsigned page_order;
352 unsigned crit_factor;
353
354 struct time_stats time;
355};
356
357void bch_bset_sort_state_free(struct bset_sort_state *);
358int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
359void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
360void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
361 struct bset_sort_state *);
362void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
363 struct bset_sort_state *);
364void bch_btree_sort_partial(struct btree_keys *, unsigned,
365 struct bset_sort_state *);
366
367static inline void bch_btree_sort(struct btree_keys *b,
368 struct bset_sort_state *state)
369{
370 bch_btree_sort_partial(b, 0, state);
371}
372
373struct bset_stats {
374 size_t sets_written, sets_unwritten;
375 size_t bytes_written, bytes_unwritten;
376 size_t floats, failed;
377};
378
379void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
380
381/* Bkey utility code */
382
383#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
384
385static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
386{
387 return bkey_idx(i->start, idx);
388}
389
390static inline void bkey_init(struct bkey *k)
391{
392 *k = ZERO_KEY;
393}
394
191static __always_inline int64_t bkey_cmp(const struct bkey *l, 395static __always_inline int64_t bkey_cmp(const struct bkey *l,
192 const struct bkey *r) 396 const struct bkey *r)
193{ 397{
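
The struct btree_keys_ops added above turns the key-handling differences between leaf (extent) nodes and internal nodes into a small vtable, so bset.c can stay format-agnostic and dispatch through thin wrappers like bch_ptr_invalid(). Illustration only: the same pattern with made-up key fields and checks; the demo_* names are invented and are not the bcache types.

#include <stdbool.h>
#include <stdio.h>

struct demo_key { unsigned size, ptrs; };

struct demo_keys;

struct demo_keys_ops {
	bool (*key_invalid)(struct demo_keys *, const struct demo_key *);
	bool is_extents;
};

struct demo_keys {
	const struct demo_keys_ops *ops;
};

static bool extent_key_invalid(struct demo_keys *b, const struct demo_key *k)
{
	return !k->size || !k->ptrs;	/* extents must have a size and a pointer */
}

static bool internal_key_invalid(struct demo_keys *b, const struct demo_key *k)
{
	return !k->ptrs;		/* internal node keys carry no size */
}

static const struct demo_keys_ops demo_extent_ops = {
	.key_invalid	= extent_key_invalid,
	.is_extents	= true,
};

static const struct demo_keys_ops demo_internal_ops = {
	.key_invalid	= internal_key_invalid,
	.is_extents	= false,
};

/* Mirrors the bch_ptr_invalid() wrapper: always dispatch through ops */
static bool demo_key_invalid(struct demo_keys *b, const struct demo_key *k)
{
	return b->ops->key_invalid(b, k);
}

int main(void)
{
	struct demo_keys leaf = { .ops = &demo_extent_ops };
	struct demo_keys node = { .ops = &demo_internal_ops };
	struct demo_key k = { .size = 0, .ptrs = 1 };

	printf("leaf invalid: %d, internal invalid: %d\n",
	       demo_key_invalid(&leaf, &k), demo_key_invalid(&node, &k));
	return 0;
}

The kernel version carries more hooks (sort_cmp, sort_fixup, insert_fixup, key_merge, key_to_text), but the dispatch shape is the same, with the ops pointer chosen when the node is initialised.
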
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 400 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
197} 401}
198 402
403void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
404 unsigned);
405bool __bch_cut_front(const struct bkey *, struct bkey *);
406bool __bch_cut_back(const struct bkey *, struct bkey *);
407
408static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
409{
410 BUG_ON(bkey_cmp(where, k) > 0);
411 return __bch_cut_front(where, k);
412}
413
414static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
415{
416 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
417 return __bch_cut_back(where, k);
418}
419
420#define PRECEDING_KEY(_k) \
421({ \
422 struct bkey *_ret = NULL; \
423 \
424 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
425 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
426 \
427 if (!_ret->low) \
428 _ret->high--; \
429 _ret->low--; \
430 } \
431 \
432 _ret; \
433})
434
435static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
436{
437 return b->ops->key_invalid(b, k);
438}
439
440static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
441{
442 return b->ops->key_bad(b, k);
443}
444
445static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
446 size_t size, const struct bkey *k)
447{
448 return b->ops->key_to_text(buf, size, k);
449}
450
451static inline bool bch_bkey_equal_header(const struct bkey *l,
452 const struct bkey *r)
453{
454 return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
455 KEY_PTRS(l) == KEY_PTRS(r) &&
456 KEY_CSUM(l) == KEY_CSUM(r));
457}
458
199/* Keylists */ 459/* Keylists */
200 460
201struct keylist { 461struct keylist {
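
PRECEDING_KEY() above builds the position immediately before a key by decrementing the packed 128-bit (high, low) value, borrowing from the high word when the low word (the offset) is zero. A sketch of the same arithmetic on plain (inode, offset) integers, illustration only; struct pos and preceding_pos are invented names.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct pos { uint64_t inode, offset; };

/* Returns false for (0, 0), which has no predecessor */
static bool preceding_pos(struct pos k, struct pos *ret)
{
	if (!k.inode && !k.offset)
		return false;

	if (!k.offset)
		k.inode--;	/* borrow from the inode */
	k.offset--;		/* wraps to UINT64_MAX when borrowing */

	*ret = k;
	return true;
}

int main(void)
{
	struct pos p;

	if (preceding_pos((struct pos) { 2, 0 }, &p))
		printf("preceding (2, 0) = (%llu, %llu)\n",
		       (unsigned long long) p.inode,
		       (unsigned long long) p.offset);
	return 0;
}
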
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
257 517
258struct bkey *bch_keylist_pop(struct keylist *); 518struct bkey *bch_keylist_pop(struct keylist *);
259void bch_keylist_pop_front(struct keylist *); 519void bch_keylist_pop_front(struct keylist *);
260int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 520int __bch_keylist_realloc(struct keylist *, unsigned);
261
262void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
263 unsigned);
264bool __bch_cut_front(const struct bkey *, struct bkey *);
265bool __bch_cut_back(const struct bkey *, struct bkey *);
266 521
267static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 522/* Debug stuff */
268{
269 BUG_ON(bkey_cmp(where, k) > 0);
270 return __bch_cut_front(where, k);
271}
272 523
273static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 524#ifdef CONFIG_BCACHE_DEBUG
274{
275 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
276 return __bch_cut_back(where, k);
277}
278
279const char *bch_ptr_status(struct cache_set *, const struct bkey *);
280bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
281bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
282
283bool bch_ptr_bad(struct btree *, const struct bkey *);
284
285static inline uint8_t gen_after(uint8_t a, uint8_t b)
286{
287 uint8_t r = a - b;
288 return r > 128U ? 0 : r;
289}
290
291static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
292 unsigned i)
293{
294 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
295}
296
297static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
298 unsigned i)
299{
300 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
301}
302
303
304typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
305
306struct bkey *bch_btree_iter_next(struct btree_iter *);
307struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
308 struct btree *, ptr_filter_fn);
309
310void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
311struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
312 struct bkey *, struct bset_tree *);
313
314/* 32 bits total: */
315#define BKEY_MID_BITS 3
316#define BKEY_EXPONENT_BITS 7
317#define BKEY_MANTISSA_BITS 22
318#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
319
320struct bkey_float {
321 unsigned exponent:BKEY_EXPONENT_BITS;
322 unsigned m:BKEY_MID_BITS;
323 unsigned mantissa:BKEY_MANTISSA_BITS;
324} __packed;
325
326/*
327 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
328 * it used to be 64, but I realized the lookup code would touch slightly less
329 * memory if it was 128.
330 *
331 * It defines the number of bytes (in struct bset) per struct bkey_float in
332 * the auxiliary search tree - when we're done searching the bkey_float tree we
333 * have this many bytes left that we do a linear search over.
334 *
335 * Since (after level 5) every level of the bset_tree is on a new cacheline,
336 * we're touching one fewer cacheline in the bset tree in exchange for one more
337 * cacheline in the linear search - but the linear search might stop before it
338 * gets to the second cacheline.
339 */
340
341#define BSET_CACHELINE 128
342#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
343 525
344#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) 526int __bch_count_data(struct btree_keys *);
345#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) 527void __bch_check_keys(struct btree_keys *, const char *, ...);
528void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
529void bch_dump_bucket(struct btree_keys *);
346 530
347void bch_bset_init_next(struct btree *); 531#else
348 532
349void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); 533static inline int __bch_count_data(struct btree_keys *b) { return -1; }
350void bch_bset_fix_lookup_table(struct btree *, struct bkey *); 534static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
535static inline void bch_dump_bucket(struct btree_keys *b) {}
536void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
351 537
352struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 538#endif
353 const struct bkey *);
354 539
355/* 540static inline bool btree_keys_expensive_checks(struct btree_keys *b)
356 * Returns the first key that is strictly greater than search
357 */
358static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
359 const struct bkey *search)
360{ 541{
361 return search ? __bch_bset_search(b, t, search) : t->data->start; 542#ifdef CONFIG_BCACHE_DEBUG
543 return *b->expensive_debug_checks;
544#else
545 return false;
546#endif
362} 547}
363 548
364#define PRECEDING_KEY(_k) \ 549static inline int bch_count_data(struct btree_keys *b)
365({ \
366 struct bkey *_ret = NULL; \
367 \
368 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
369 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
370 \
371 if (!_ret->low) \
372 _ret->high--; \
373 _ret->low--; \
374 } \
375 \
376 _ret; \
377})
378
379bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
380void bch_btree_sort_lazy(struct btree *);
381void bch_btree_sort_into(struct btree *, struct btree *);
382void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
383void bch_btree_sort_partial(struct btree *, unsigned);
384
385static inline void bch_btree_sort(struct btree *b)
386{ 550{
387 bch_btree_sort_partial(b, 0); 551 return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
388} 552}
389 553
390int bch_bset_print_stats(struct cache_set *, char *); 554#define bch_check_keys(b, ...) \
555do { \
556 if (btree_keys_expensive_checks(b)) \
557 __bch_check_keys(b, __VA_ARGS__); \
558} while (0)
391 559
392#endif 560#endif
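
The tail of bset.h above gates the expensive consistency checks twice: they are compiled out entirely without CONFIG_BCACHE_DEBUG, and even when built in they only run if the per-cache-set expensive_debug_checks flag is set. The same two-level shape in a standalone sketch, illustration only; DEMO_DEBUG and the demo_* names are invented.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_DEBUG 1

struct demo_keys {
	bool *expensive_debug_checks;	/* points at a runtime toggle */
};

#if DEMO_DEBUG
static int __demo_count_data(struct demo_keys *b)
{
	/* pretend this walks every key; expensive */
	return 42;
}

static bool demo_expensive_checks(struct demo_keys *b)
{
	return *b->expensive_debug_checks;
}
#else
static int __demo_count_data(struct demo_keys *b) { return -1; }
static bool demo_expensive_checks(struct demo_keys *b) { return false; }
#endif

static int demo_count_data(struct demo_keys *b)
{
	/* -1 means "not checked", mirroring bch_count_data() */
	return demo_expensive_checks(b) ? __demo_count_data(b) : -1;
}

int main(void)
{
	bool enabled = true;
	struct demo_keys b = { .expensive_debug_checks = &enabled };

	printf("count: %d\n", demo_count_data(&b));
	enabled = false;
	printf("count: %d\n", demo_count_data(&b));
	return 0;
}
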
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 31bb53fcc67a..5f9c2a665ca5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,7 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "writeback.h" 26#include "extents.h"
27 27
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/bitops.h> 29#include <linux/bitops.h>
@@ -89,13 +89,6 @@
89 * Test module load/unload 89 * Test module load/unload
90 */ 90 */
91 91
92enum {
93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
97};
98
99#define MAX_NEED_GC 64 92#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72 93#define MAX_SAVE_PRIO 72
101 94
@@ -106,14 +99,6 @@ enum {
106 99
107static struct workqueue_struct *btree_io_wq; 100static struct workqueue_struct *btree_io_wq;
108 101
109static inline bool should_split(struct btree *b)
110{
111 struct bset *i = write_block(b);
112 return b->written >= btree_blocks(b) ||
113 (b->written + __set_blocks(i, i->keys + 15, b->c)
114 > btree_blocks(b));
115}
116
117#define insert_lock(s, b) ((b)->level <= (s)->lock) 102#define insert_lock(s, b) ((b)->level <= (s)->lock)
118 103
119/* 104/*
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 152 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \ 153 } \
169 rw_unlock(_w, _b); \ 154 rw_unlock(_w, _b); \
155 if (_r == -EINTR) \
156 schedule(); \
170 bch_cannibalize_unlock(c); \ 157 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \ 158 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \ 159 wait_event((c)->try_wait, \
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
175 } \ 162 } \
176 } while (_r == -EINTR); \ 163 } while (_r == -EINTR); \
177 \ 164 \
165 finish_wait(&(c)->bucket_wait, &(op)->wait); \
178 _r; \ 166 _r; \
179}) 167})
180 168
169static inline struct bset *write_block(struct btree *b)
170{
171 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
172}
173
181/* Btree key manipulation */ 174/* Btree key manipulation */
182 175
183void bkey_put(struct cache_set *c, struct bkey *k) 176void bkey_put(struct cache_set *c, struct bkey *k)
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
194static uint64_t btree_csum_set(struct btree *b, struct bset *i) 187static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195{ 188{
196 uint64_t crc = b->key.ptr[0]; 189 uint64_t crc = b->key.ptr[0];
197 void *data = (void *) i + 8, *end = end(i); 190 void *data = (void *) i + 8, *end = bset_bkey_last(i);
198 191
199 crc = bch_crc64_update(crc, data, end - data); 192 crc = bch_crc64_update(crc, data, end - data);
200 return crc ^ 0xffffffffffffffffULL; 193 return crc ^ 0xffffffffffffffffULL;
201} 194}
202 195
203static void bch_btree_node_read_done(struct btree *b) 196void bch_btree_node_read_done(struct btree *b)
204{ 197{
205 const char *err = "bad btree header"; 198 const char *err = "bad btree header";
206 struct bset *i = b->sets[0].data; 199 struct bset *i = btree_bset_first(b);
207 struct btree_iter *iter; 200 struct btree_iter *iter;
208 201
209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 202 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
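
btree_csum_set() above checksums a bset starting 8 bytes in, so the stored csum field does not cover itself, seeds the crc with the node's first pointer, and inverts the result at the end. A toy standalone version of the same shape; the additive toy_csum stands in for bcache's crc64, and demo_set and its fields are invented for the sketch.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct demo_set {
	uint64_t csum;		/* first 8 bytes: not covered by the checksum */
	uint64_t seq;
	uint32_t keys;
	uint64_t d[8];
};

static uint64_t toy_csum(uint64_t seed, const void *data, size_t len)
{
	const unsigned char *p = data;
	uint64_t crc = seed;
	size_t i;

	for (i = 0; i < len; i++)
		crc = crc * 31 + p[i];

	return crc;
}

static uint64_t demo_csum_set(uint64_t seed, struct demo_set *i, size_t bytes)
{
	void *data = (char *) i + 8;	/* skip the csum field itself */

	return toy_csum(seed, data, bytes - 8) ^ 0xffffffffffffffffULL;
}

int main(void)
{
	struct demo_set s;

	memset(&s, 0, sizeof(s));
	s.seq = 1234;
	s.keys = 3;
	s.csum = demo_csum_set(42, &s, sizeof(s));
	printf("csum %llx\n", (unsigned long long) s.csum);
	return 0;
}
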
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
211 iter->used = 0; 204 iter->used = 0;
212 205
213#ifdef CONFIG_BCACHE_DEBUG 206#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b; 207 iter->b = &b->keys;
215#endif 208#endif
216 209
217 if (!i->seq) 210 if (!i->seq)
218 goto err; 211 goto err;
219 212
220 for (; 213 for (;
221 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 214 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
222 i = write_block(b)) { 215 i = write_block(b)) {
223 err = "unsupported bset version"; 216 err = "unsupported bset version";
224 if (i->version > BCACHE_BSET_VERSION) 217 if (i->version > BCACHE_BSET_VERSION)
225 goto err; 218 goto err;
226 219
227 err = "bad btree header"; 220 err = "bad btree header";
228 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 221 if (b->written + set_blocks(i, block_bytes(b->c)) >
222 btree_blocks(b))
229 goto err; 223 goto err;
230 224
231 err = "bad magic"; 225 err = "bad magic";
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
245 } 239 }
246 240
247 err = "empty set"; 241 err = "empty set";
248 if (i != b->sets[0].data && !i->keys) 242 if (i != b->keys.set[0].data && !i->keys)
249 goto err; 243 goto err;
250 244
251 bch_btree_iter_push(iter, i->start, end(i)); 245 bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
252 246
253 b->written += set_blocks(i, b->c); 247 b->written += set_blocks(i, block_bytes(b->c));
254 } 248 }
255 249
256 err = "corrupted btree"; 250 err = "corrupted btree";
257 for (i = write_block(b); 251 for (i = write_block(b);
258 index(i, b) < btree_blocks(b); 252 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
259 i = ((void *) i) + block_bytes(b->c)) 253 i = ((void *) i) + block_bytes(b->c))
260 if (i->seq == b->sets[0].data->seq) 254 if (i->seq == b->keys.set[0].data->seq)
261 goto err; 255 goto err;
262 256
263 bch_btree_sort_and_fix_extents(b, iter); 257 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
264 258
265 i = b->sets[0].data; 259 i = b->keys.set[0].data;
266 err = "short btree key"; 260 err = "short btree key";
267 if (b->sets[0].size && 261 if (b->keys.set[0].size &&
268 bkey_cmp(&b->key, &b->sets[0].end) < 0) 262 bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
269 goto err; 263 goto err;
270 264
271 if (b->written < btree_blocks(b)) 265 if (b->written < btree_blocks(b))
272 bch_bset_init_next(b); 266 bch_bset_init_next(&b->keys, write_block(b),
267 bset_magic(&b->c->sb));
273out: 268out:
274 mempool_free(iter, b->c->fill_iter); 269 mempool_free(iter, b->c->fill_iter);
275 return; 270 return;
276err: 271err:
277 set_btree_node_io_error(b); 272 set_btree_node_io_error(b);
278 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 273 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
279 err, PTR_BUCKET_NR(b->c, &b->key, 0), 274 err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 index(i, b), i->keys); 275 bset_block_offset(b, i), i->keys);
281 goto out; 276 goto out;
282} 277}
283 278
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
287 closure_put(cl); 282 closure_put(cl);
288} 283}
289 284
290void bch_btree_node_read(struct btree *b) 285static void bch_btree_node_read(struct btree *b)
291{ 286{
292 uint64_t start_time = local_clock(); 287 uint64_t start_time = local_clock();
293 struct closure cl; 288 struct closure cl;
@@ -299,11 +294,11 @@ void bch_btree_node_read(struct btree *b)
299 294
300 bio = bch_bbio_alloc(b->c); 295 bio = bch_bbio_alloc(b->c);
301 bio->bi_rw = REQ_META|READ_SYNC; 296 bio->bi_rw = REQ_META|READ_SYNC;
302 bio->bi_size = KEY_SIZE(&b->key) << 9; 297 bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
303 bio->bi_end_io = btree_node_read_endio; 298 bio->bi_end_io = btree_node_read_endio;
304 bio->bi_private = &cl; 299 bio->bi_private = &cl;
305 300
306 bch_bio_map(bio, b->sets[0].data); 301 bch_bio_map(bio, b->keys.set[0].data);
307 302
308 bch_submit_bbio(bio, b->c, &b->key, 0); 303 bch_submit_bbio(bio, b->c, &b->key, 0);
309 closure_sync(&cl); 304 closure_sync(&cl);
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
340 w->journal = NULL; 335 w->journal = NULL;
341} 336}
342 337
338static void btree_node_write_unlock(struct closure *cl)
339{
340 struct btree *b = container_of(cl, struct btree, io);
341
342 up(&b->io_mutex);
343}
344
343static void __btree_node_write_done(struct closure *cl) 345static void __btree_node_write_done(struct closure *cl)
344{ 346{
345 struct btree *b = container_of(cl, struct btree, io.cl); 347 struct btree *b = container_of(cl, struct btree, io);
346 struct btree_write *w = btree_prev_write(b); 348 struct btree_write *w = btree_prev_write(b);
347 349
348 bch_bbio_free(b->bio, b->c); 350 bch_bbio_free(b->bio, b->c);
@@ -353,16 +355,16 @@ static void __btree_node_write_done(struct closure *cl)
353 queue_delayed_work(btree_io_wq, &b->work, 355 queue_delayed_work(btree_io_wq, &b->work,
354 msecs_to_jiffies(30000)); 356 msecs_to_jiffies(30000));
355 357
356 closure_return(cl); 358 closure_return_with_destructor(cl, btree_node_write_unlock);
357} 359}
358 360
359static void btree_node_write_done(struct closure *cl) 361static void btree_node_write_done(struct closure *cl)
360{ 362{
361 struct btree *b = container_of(cl, struct btree, io.cl); 363 struct btree *b = container_of(cl, struct btree, io);
362 struct bio_vec *bv; 364 struct bio_vec *bv;
363 int n; 365 int n;
364 366
365 __bio_for_each_segment(bv, b->bio, n, 0) 367 bio_for_each_segment_all(bv, b->bio, n)
366 __free_page(bv->bv_page); 368 __free_page(bv->bv_page);
367 369
368 __btree_node_write_done(cl); 370 __btree_node_write_done(cl);
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
371static void btree_node_write_endio(struct bio *bio, int error) 373static void btree_node_write_endio(struct bio *bio, int error)
372{ 374{
373 struct closure *cl = bio->bi_private; 375 struct closure *cl = bio->bi_private;
374 struct btree *b = container_of(cl, struct btree, io.cl); 376 struct btree *b = container_of(cl, struct btree, io);
375 377
376 if (error) 378 if (error)
377 set_btree_node_io_error(b); 379 set_btree_node_io_error(b);
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
382 384
383static void do_btree_node_write(struct btree *b) 385static void do_btree_node_write(struct btree *b)
384{ 386{
385 struct closure *cl = &b->io.cl; 387 struct closure *cl = &b->io;
386 struct bset *i = b->sets[b->nsets].data; 388 struct bset *i = btree_bset_last(b);
387 BKEY_PADDED(key) k; 389 BKEY_PADDED(key) k;
388 390
389 i->version = BCACHE_BSET_VERSION; 391 i->version = BCACHE_BSET_VERSION;
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
395 b->bio->bi_end_io = btree_node_write_endio; 397 b->bio->bi_end_io = btree_node_write_endio;
396 b->bio->bi_private = cl; 398 b->bio->bi_private = cl;
397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 399 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
398 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 400 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
399 bch_bio_map(b->bio, i); 401 bch_bio_map(b->bio, i);
400 402
401 /* 403 /*
@@ -414,14 +416,15 @@ static void do_btree_node_write(struct btree *b)
414 */ 416 */
415 417
416 bkey_copy(&k.key, &b->key); 418 bkey_copy(&k.key, &b->key);
417 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i));
418 421
419 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 422 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 int j; 423 int j;
421 struct bio_vec *bv; 424 struct bio_vec *bv;
422 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 425 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
423 426
424 bio_for_each_segment(bv, b->bio, j) 427 bio_for_each_segment_all(bv, b->bio, j)
425 memcpy(page_address(bv->bv_page), 428 memcpy(page_address(bv->bv_page),
426 base + j * PAGE_SIZE, PAGE_SIZE); 429 base + j * PAGE_SIZE, PAGE_SIZE);
427 430
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
435 bch_submit_bbio(b->bio, b->c, &k.key, 0); 438 bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 439
437 closure_sync(cl); 440 closure_sync(cl);
438 __btree_node_write_done(cl); 441 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
439 } 442 }
440} 443}
441 444
442void bch_btree_node_write(struct btree *b, struct closure *parent) 445void bch_btree_node_write(struct btree *b, struct closure *parent)
443{ 446{
444 struct bset *i = b->sets[b->nsets].data; 447 struct bset *i = btree_bset_last(b);
445 448
446 trace_bcache_btree_write(b); 449 trace_bcache_btree_write(b);
447 450
448 BUG_ON(current->bio_list); 451 BUG_ON(current->bio_list);
449 BUG_ON(b->written >= btree_blocks(b)); 452 BUG_ON(b->written >= btree_blocks(b));
450 BUG_ON(b->written && !i->keys); 453 BUG_ON(b->written && !i->keys);
451 BUG_ON(b->sets->data->seq != i->seq); 454 BUG_ON(btree_bset_first(b)->seq != i->seq);
452 bch_check_keys(b, "writing"); 455 bch_check_keys(&b->keys, "writing");
453 456
454 cancel_delayed_work(&b->work); 457 cancel_delayed_work(&b->work);
455 458
456 /* If caller isn't waiting for write, parent refcount is cache set */ 459 /* If caller isn't waiting for write, parent refcount is cache set */
457 closure_lock(&b->io, parent ?: &b->c->cl); 460 down(&b->io_mutex);
461 closure_init(&b->io, parent ?: &b->c->cl);
458 462
459 clear_bit(BTREE_NODE_dirty, &b->flags); 463 clear_bit(BTREE_NODE_dirty, &b->flags);
460 change_bit(BTREE_NODE_write_idx, &b->flags); 464 change_bit(BTREE_NODE_write_idx, &b->flags);
461 465
462 do_btree_node_write(b); 466 do_btree_node_write(b);
463 467
464 b->written += set_blocks(i, b->c); 468 atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
465 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 469 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 470
468 bch_btree_sort_lazy(b); 471 b->written += set_blocks(i, block_bytes(b->c));
472
473 /* If not a leaf node, always sort */
474 if (b->level && b->keys.nsets)
475 bch_btree_sort(&b->keys, &b->c->sort);
476 else
477 bch_btree_sort_lazy(&b->keys, &b->c->sort);
478
479 /*
480 * do verify if there was more than one set initially (i.e. we did a
481 * sort) and we sorted down to a single set:
482 */
483 if (i != b->keys.set->data && !b->keys.nsets)
484 bch_btree_verify(b);
469 485
470 if (b->written < btree_blocks(b)) 486 if (b->written < btree_blocks(b))
471 bch_bset_init_next(b); 487 bch_bset_init_next(&b->keys, write_block(b),
488 bset_magic(&b->c->sb));
472} 489}
473 490
474static void bch_btree_node_write_sync(struct btree *b) 491static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
493 510
494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 511static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495{ 512{
496 struct bset *i = b->sets[b->nsets].data; 513 struct bset *i = btree_bset_last(b);
497 struct btree_write *w = btree_current_write(b); 514 struct btree_write *w = btree_current_write(b);
498 515
499 BUG_ON(!b->written); 516 BUG_ON(!b->written);
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
528 * mca -> memory cache 545 * mca -> memory cache
529 */ 546 */
530 547
531static void mca_reinit(struct btree *b)
532{
533 unsigned i;
534
535 b->flags = 0;
536 b->written = 0;
537 b->nsets = 0;
538
539 for (i = 0; i < MAX_BSETS; i++)
540 b->sets[i].size = 0;
541 /*
542 * Second loop starts at 1 because b->sets[0]->data is the memory we
543 * allocated
544 */
545 for (i = 1; i < MAX_BSETS; i++)
546 b->sets[i].data = NULL;
547}
548
549#define mca_reserve(c) (((c->root && c->root->level) \ 548#define mca_reserve(c) (((c->root && c->root->level) \
550 ? c->root->level : 1) * 8 + 16) 549 ? c->root->level : 1) * 8 + 16)
551#define mca_can_free(c) \ 550#define mca_can_free(c) \
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
553 552
554static void mca_data_free(struct btree *b) 553static void mca_data_free(struct btree *b)
555{ 554{
556 struct bset_tree *t = b->sets; 555 BUG_ON(b->io_mutex.count != 1);
557 BUG_ON(!closure_is_unlocked(&b->io.cl));
558 556
559 if (bset_prev_bytes(b) < PAGE_SIZE) 557 bch_btree_keys_free(&b->keys);
560 kfree(t->prev);
561 else
562 free_pages((unsigned long) t->prev,
563 get_order(bset_prev_bytes(b)));
564 558
565 if (bset_tree_bytes(b) < PAGE_SIZE)
566 kfree(t->tree);
567 else
568 free_pages((unsigned long) t->tree,
569 get_order(bset_tree_bytes(b)));
570
571 free_pages((unsigned long) t->data, b->page_order);
572
573 t->prev = NULL;
574 t->tree = NULL;
575 t->data = NULL;
576 list_move(&b->list, &b->c->btree_cache_freed);
577 b->c->bucket_cache_used--; 559 b->c->bucket_cache_used--;
560 list_move(&b->list, &b->c->btree_cache_freed);
578} 561}
579 562
580static void mca_bucket_free(struct btree *b) 563static void mca_bucket_free(struct btree *b)
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
593 576
594static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 577static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595{ 578{
596 struct bset_tree *t = b->sets; 579 if (!bch_btree_keys_alloc(&b->keys,
597 BUG_ON(t->data); 580 max_t(unsigned,
598 581 ilog2(b->c->btree_pages),
599 b->page_order = max_t(unsigned, 582 btree_order(k)),
600 ilog2(b->c->btree_pages), 583 gfp)) {
601 btree_order(k)); 584 b->c->bucket_cache_used++;
602 585 list_move(&b->list, &b->c->btree_cache);
603 t->data = (void *) __get_free_pages(gfp, b->page_order); 586 } else {
604 if (!t->data) 587 list_move(&b->list, &b->c->btree_cache_freed);
605 goto err; 588 }
606
607 t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 ? kmalloc(bset_tree_bytes(b), gfp)
609 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 if (!t->tree)
611 goto err;
612
613 t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 ? kmalloc(bset_prev_bytes(b), gfp)
615 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 if (!t->prev)
617 goto err;
618
619 list_move(&b->list, &b->c->btree_cache);
620 b->c->bucket_cache_used++;
621 return;
622err:
623 mca_data_free(b);
624} 589}
625 590
626static struct btree *mca_bucket_alloc(struct cache_set *c, 591static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
635 INIT_LIST_HEAD(&b->list); 600 INIT_LIST_HEAD(&b->list);
636 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 601 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 b->c = c; 602 b->c = c;
638 closure_init_unlocked(&b->io); 603 sema_init(&b->io_mutex, 1);
639 604
640 mca_data_alloc(b, k, gfp); 605 mca_data_alloc(b, k, gfp);
641 return b; 606 return b;
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
651 if (!down_write_trylock(&b->lock)) 616 if (!down_write_trylock(&b->lock))
652 return -ENOMEM; 617 return -ENOMEM;
653 618
654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 619 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
655 620
656 if (b->page_order < min_order || 621 if (b->keys.page_order < min_order)
657 (!flush && 622 goto out_unlock;
658 (btree_node_dirty(b) || 623
659 atomic_read(&b->io.cl.remaining) != -1))) { 624 if (!flush) {
660 rw_unlock(true, b); 625 if (btree_node_dirty(b))
661 return -ENOMEM; 626 goto out_unlock;
627
628 if (down_trylock(&b->io_mutex))
629 goto out_unlock;
630 up(&b->io_mutex);
662 } 631 }
663 632
664 if (btree_node_dirty(b)) 633 if (btree_node_dirty(b))
665 bch_btree_node_write_sync(b); 634 bch_btree_node_write_sync(b);
666 635
667 /* wait for any in flight btree write */ 636 /* wait for any in flight btree write */
668 closure_wait_event(&b->io.wait, &cl, 637 down(&b->io_mutex);
669 atomic_read(&b->io.cl.remaining) == -1); 638 up(&b->io_mutex);
670 639
671 return 0; 640 return 0;
641out_unlock:
642 rw_unlock(true, b);
643 return -ENOMEM;
672} 644}
673 645
674static unsigned long bch_mca_scan(struct shrinker *shrink, 646static unsigned long bch_mca_scan(struct shrinker *shrink,
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
714 } 686 }
715 } 687 }
716 688
717 /*
718 * Can happen right when we first start up, before we've read in any
719 * btree nodes
720 */
721 if (list_empty(&c->btree_cache))
722 goto out;
723
724 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 689 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
690 if (list_empty(&c->btree_cache))
691 goto out;
692
725 b = list_first_entry(&c->btree_cache, struct btree, list); 693 b = list_first_entry(&c->btree_cache, struct btree, list);
726 list_rotate_left(&c->btree_cache); 694 list_rotate_left(&c->btree_cache);
727 695
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
767#ifdef CONFIG_BCACHE_DEBUG 735#ifdef CONFIG_BCACHE_DEBUG
768 if (c->verify_data) 736 if (c->verify_data)
769 list_move(&c->verify_data->list, &c->btree_cache); 737 list_move(&c->verify_data->list, &c->btree_cache);
738
739 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
770#endif 740#endif
771 741
772 list_splice(&c->btree_cache_freeable, 742 list_splice(&c->btree_cache_freeable,
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
807#ifdef CONFIG_BCACHE_DEBUG 777#ifdef CONFIG_BCACHE_DEBUG
808 mutex_init(&c->verify_lock); 778 mutex_init(&c->verify_lock);
809 779
780 c->verify_ondisk = (void *)
781 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
782
810 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 783 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 784
812 if (c->verify_data && 785 if (c->verify_data &&
813 c->verify_data->sets[0].data) 786 c->verify_data->keys.set->data)
814 list_del_init(&c->verify_data->list); 787 list_del_init(&c->verify_data->list);
815 else 788 else
816 c->verify_data = NULL; 789 c->verify_data = NULL;
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
908 list_for_each_entry(b, &c->btree_cache_freed, list) 881 list_for_each_entry(b, &c->btree_cache_freed, list)
909 if (!mca_reap(b, 0, false)) { 882 if (!mca_reap(b, 0, false)) {
910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 883 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 if (!b->sets[0].data) 884 if (!b->keys.set[0].data)
912 goto err; 885 goto err;
913 else 886 else
914 goto out; 887 goto out;
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
919 goto err; 892 goto err;
920 893
921 BUG_ON(!down_write_trylock(&b->lock)); 894 BUG_ON(!down_write_trylock(&b->lock));
922 if (!b->sets->data) 895 if (!b->keys.set->data)
923 goto err; 896 goto err;
924out: 897out:
925 BUG_ON(!closure_is_unlocked(&b->io.cl)); 898 BUG_ON(b->io_mutex.count != 1);
926 899
927 bkey_copy(&b->key, k); 900 bkey_copy(&b->key, k);
928 list_move(&b->list, &c->btree_cache); 901 list_move(&b->list, &c->btree_cache);
@@ -930,10 +903,17 @@ out:
930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 903 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 904
932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 905 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 b->level = level;
934 b->parent = (void *) ~0UL; 906 b->parent = (void *) ~0UL;
907 b->flags = 0;
908 b->written = 0;
909 b->level = level;
935 910
936 mca_reinit(b); 911 if (!b->level)
912 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
913 &b->c->expensive_debug_checks);
914 else
915 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
916 &b->c->expensive_debug_checks);
937 917
938 return b; 918 return b;
939err: 919err:
@@ -994,13 +974,13 @@ retry:
994 974
995 b->accessed = 1; 975 b->accessed = 1;
996 976
997 for (; i <= b->nsets && b->sets[i].size; i++) { 977 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
998 prefetch(b->sets[i].tree); 978 prefetch(b->keys.set[i].tree);
999 prefetch(b->sets[i].data); 979 prefetch(b->keys.set[i].data);
1000 } 980 }
1001 981
1002 for (; i <= b->nsets; i++) 982 for (; i <= b->keys.nsets; i++)
1003 prefetch(b->sets[i].data); 983 prefetch(b->keys.set[i].data);
1004 984
1005 if (btree_node_io_error(b)) { 985 if (btree_node_io_error(b)) {
1006 rw_unlock(write, b); 986 rw_unlock(write, b);
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1063 1043
1064 mutex_lock(&c->bucket_lock); 1044 mutex_lock(&c->bucket_lock);
1065retry: 1045retry:
1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1046 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1067 goto err; 1047 goto err;
1068 1048
1069 bkey_put(c, &k.key); 1049 bkey_put(c, &k.key);
@@ -1080,7 +1060,7 @@ retry:
1080 } 1060 }
1081 1061
1082 b->accessed = 1; 1062 b->accessed = 1;
1083 bch_bset_init_next(b); 1063 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1084 1064
1085 mutex_unlock(&c->bucket_lock); 1065 mutex_unlock(&c->bucket_lock);
1086 1066
@@ -1098,8 +1078,10 @@ err:
1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1078static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099{ 1079{
1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1080 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 if (!IS_ERR_OR_NULL(n)) 1081 if (!IS_ERR_OR_NULL(n)) {
1102 bch_btree_sort_into(b, n); 1082 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1083 bkey_copy_key(&n->key, &b->key);
1084 }
1103 1085
1104 return n; 1086 return n;
1105} 1087}
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1120 atomic_inc(&b->c->prio_blocked); 1102 atomic_inc(&b->c->prio_blocked);
1121} 1103}
1122 1104
1105static int btree_check_reserve(struct btree *b, struct btree_op *op)
1106{
1107 struct cache_set *c = b->c;
1108 struct cache *ca;
1109 unsigned i, reserve = c->root->level * 2 + 1;
1110 int ret = 0;
1111
1112 mutex_lock(&c->bucket_lock);
1113
1114 for_each_cache(ca, c, i)
1115 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1116 if (op)
1117 prepare_to_wait(&c->bucket_wait, &op->wait,
1118 TASK_UNINTERRUPTIBLE);
1119 ret = -EINTR;
1120 break;
1121 }
1122
1123 mutex_unlock(&c->bucket_lock);
1124 return ret;
1125}
1126
1123/* Garbage collection */ 1127/* Garbage collection */
1124 1128
1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1129uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -1163,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1163 /* guard against overflow */ 1167 /* guard against overflow */
1164 SET_GC_SECTORS_USED(g, min_t(unsigned, 1168 SET_GC_SECTORS_USED(g, min_t(unsigned,
1165 GC_SECTORS_USED(g) + KEY_SIZE(k), 1169 GC_SECTORS_USED(g) + KEY_SIZE(k),
1166 (1 << 14) - 1)); 1170 MAX_GC_SECTORS_USED));
1167 1171
1168 BUG_ON(!GC_SECTORS_USED(g)); 1172 BUG_ON(!GC_SECTORS_USED(g));
1169 } 1173 }
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1183 1187
1184 gc->nodes++; 1188 gc->nodes++;
1185 1189
1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1190 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1187 stale = max(stale, btree_mark_key(b, k)); 1191 stale = max(stale, btree_mark_key(b, k));
1188 keys++; 1192 keys++;
1189 1193
1190 if (bch_ptr_bad(b, k)) 1194 if (bch_ptr_bad(&b->keys, k))
1191 continue; 1195 continue;
1192 1196
1193 gc->key_bytes += bkey_u64s(k); 1197 gc->key_bytes += bkey_u64s(k);
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1197 gc->data += KEY_SIZE(k); 1201 gc->data += KEY_SIZE(k);
1198 } 1202 }
1199 1203
1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1204 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1201 btree_bug_on(t->size && 1205 btree_bug_on(t->size &&
1202 bset_written(b, t) && 1206 bset_written(&b->keys, t) &&
1203 bkey_cmp(&b->key, &t->end) < 0, 1207 bkey_cmp(&b->key, &t->end) < 0,
1204 b, "found short btree key in gc"); 1208 b, "found short btree key in gc");
1205 1209
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1243 blocks = btree_default_blocks(b->c) * 2 / 3; 1247 blocks = btree_default_blocks(b->c) * 2 / 3;
1244 1248
1245 if (nodes < 2 || 1249 if (nodes < 2 ||
1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1250 __set_blocks(b->keys.set[0].data, keys,
1251 block_bytes(b->c)) > blocks * (nodes - 1))
1247 return 0; 1252 return 0;
1248 1253
1249 for (i = 0; i < nodes; i++) { 1254 for (i = 0; i < nodes; i++) {
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1253 } 1258 }
1254 1259
1255 for (i = nodes - 1; i > 0; --i) { 1260 for (i = nodes - 1; i > 0; --i) {
1256 struct bset *n1 = new_nodes[i]->sets->data; 1261 struct bset *n1 = btree_bset_first(new_nodes[i]);
1257 struct bset *n2 = new_nodes[i - 1]->sets->data; 1262 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1258 struct bkey *k, *last = NULL; 1263 struct bkey *k, *last = NULL;
1259 1264
1260 keys = 0; 1265 keys = 0;
1261 1266
1262 if (i > 1) { 1267 if (i > 1) {
1263 for (k = n2->start; 1268 for (k = n2->start;
1264 k < end(n2); 1269 k < bset_bkey_last(n2);
1265 k = bkey_next(k)) { 1270 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys + 1271 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks) 1272 bkey_u64s(k),
1273 block_bytes(b->c)) > blocks)
1268 break; 1274 break;
1269 1275
1270 last = k; 1276 last = k;
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1280 * though) 1286 * though)
1281 */ 1287 */
1282 if (__set_blocks(n1, n1->keys + n2->keys, 1288 if (__set_blocks(n1, n1->keys + n2->keys,
1283 b->c) > btree_blocks(new_nodes[i])) 1289 block_bytes(b->c)) >
1290 btree_blocks(new_nodes[i]))
1284 goto out_nocoalesce; 1291 goto out_nocoalesce;
1285 1292
1286 keys = n2->keys; 1293 keys = n2->keys;
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1288 last = &r->b->key; 1295 last = &r->b->key;
1289 } 1296 }
1290 1297
1291 BUG_ON(__set_blocks(n1, n1->keys + keys, 1298 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1292 b->c) > btree_blocks(new_nodes[i])); 1299 btree_blocks(new_nodes[i]));
1293 1300
1294 if (last) 1301 if (last)
1295 bkey_copy_key(&new_nodes[i]->key, last); 1302 bkey_copy_key(&new_nodes[i]->key, last);
1296 1303
1297 memcpy(end(n1), 1304 memcpy(bset_bkey_last(n1),
1298 n2->start, 1305 n2->start,
1299 (void *) node(n2, keys) - (void *) n2->start); 1306 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1300 1307
1301 n1->keys += keys; 1308 n1->keys += keys;
1302 r[i].keys = n1->keys; 1309 r[i].keys = n1->keys;
1303 1310
1304 memmove(n2->start, 1311 memmove(n2->start,
1305 node(n2, keys), 1312 bset_bkey_idx(n2, keys),
1306 (void *) end(n2) - (void *) node(n2, keys)); 1313 (void *) bset_bkey_last(n2) -
1314 (void *) bset_bkey_idx(n2, keys));
1307 1315
1308 n2->keys -= keys; 1316 n2->keys -= keys;
1309 1317
1310 if (bch_keylist_realloc(keylist, 1318 if (__bch_keylist_realloc(keylist,
1311 KEY_PTRS(&new_nodes[i]->key), b->c)) 1319 bkey_u64s(&new_nodes[i]->key)))
1312 goto out_nocoalesce; 1320 goto out_nocoalesce;
1313 1321
1314 bch_btree_node_write(new_nodes[i], &cl); 1322 bch_btree_node_write(new_nodes[i], &cl);
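The memcpy/memmove pair above appends the first 'keys' 64-bit words of key data from n2 onto the tail of n1 and then compacts n2, now expressed through the bset_bkey_last()/bset_bkey_idx() accessors. A self-contained sketch of that move, with a bset reduced to a plain counted array of uint64_t (toy types only, not the on-disk format):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_set {
        uint64_t d[16];         /* key data, counted in 64-bit words */
        unsigned keys;
};

/* Append the first 'keys' words of n2 to n1, then compact n2. */
static void move_front(struct toy_set *n1, struct toy_set *n2, unsigned keys)
{
        memcpy(n1->d + n1->keys, n2->d, keys * sizeof(uint64_t));
        n1->keys += keys;

        memmove(n2->d, n2->d + keys, (n2->keys - keys) * sizeof(uint64_t));
        n2->keys -= keys;
}

int main(void)
{
        struct toy_set a = { .d = { 1, 2 }, .keys = 2 };
        struct toy_set b = { .d = { 3, 4, 5 }, .keys = 3 };

        move_front(&a, &b, 2);
        printf("a.keys=%u b.keys=%u a.d[3]=%llu\n", a.keys, b.keys,
               (unsigned long long) a.d[3]);    /* a.keys=4 b.keys=1 a.d[3]=4 */
        return 0;
}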
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1316 } 1324 }
1317 1325
1318 for (i = 0; i < nodes; i++) { 1326 for (i = 0; i < nodes; i++) {
1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1327 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
1320 goto out_nocoalesce; 1328 goto out_nocoalesce;
1321 1329
1322 make_btree_freeing_key(r[i].b, keylist->top); 1330 make_btree_freeing_key(r[i].b, keylist->top);
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1324 } 1332 }
1325 1333
1326 /* We emptied out this node */ 1334 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys); 1335 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1328 btree_node_free(new_nodes[0]); 1336 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]); 1337 rw_unlock(true, new_nodes[0]);
1330 1338
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
1370 struct btree_iter iter; 1378 struct btree_iter iter;
1371 unsigned ret = 0; 1379 unsigned ret = 0;
1372 1380
1373 for_each_key_filter(b, k, &iter, bch_ptr_bad) 1381 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1374 ret += bkey_u64s(k); 1382 ret += bkey_u64s(k);
1375 1383
1376 return ret; 1384 return ret;
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1398 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 1399
1392 bch_keylist_init(&keys); 1400 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done); 1401 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1394 1402
1395 for (i = 0; i < GC_MERGE_NODES; i++) 1403 for (i = 0; i < GC_MERGE_NODES; i++)
1396 r[i].b = ERR_PTR(-EINTR); 1404 r[i].b = ERR_PTR(-EINTR);
1397 1405
1398 while (1) { 1406 while (1) {
1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1407 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1400 if (k) { 1408 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1409 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) { 1410 if (IS_ERR(r->b)) {
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1416 1424
1417 if (!IS_ERR(last->b)) { 1425 if (!IS_ERR(last->b)) {
1418 should_rewrite = btree_gc_mark_node(last->b, gc); 1426 should_rewrite = btree_gc_mark_node(last->b, gc);
1419 if (should_rewrite) { 1427 if (should_rewrite &&
1428 !btree_check_reserve(b, NULL)) {
1420 n = btree_node_alloc_replacement(last->b, 1429 n = btree_node_alloc_replacement(last->b,
1421 false); 1430 false);
1422 1431
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1705 struct bucket *g; 1714 struct bucket *g;
1706 struct btree_iter iter; 1715 struct btree_iter iter;
1707 1716
1708 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1717 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1709 for (i = 0; i < KEY_PTRS(k); i++) { 1718 for (i = 0; i < KEY_PTRS(k); i++) {
1710 if (!ptr_available(b->c, k, i)) 1719 if (!ptr_available(b->c, k, i))
1711 continue; 1720 continue;
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1728 } 1737 }
1729 1738
1730 if (b->level) { 1739 if (b->level) {
1731 bch_btree_iter_init(b, &iter, NULL); 1740 bch_btree_iter_init(&b->keys, &iter, NULL);
1732 1741
1733 do { 1742 do {
1734 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1743 k = bch_btree_iter_next_filter(&iter, &b->keys,
1744 bch_ptr_bad);
1735 if (k) 1745 if (k)
1736 btree_node_prefetch(b->c, k, b->level - 1); 1746 btree_node_prefetch(b->c, k, b->level - 1);
1737 1747
@@ -1774,235 +1784,36 @@ err:
1774 1784
1775/* Btree insertion */ 1785/* Btree insertion */
1776 1786
1777static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1787static bool btree_insert_key(struct btree *b, struct bkey *k,
1778{ 1788 struct bkey *replace_key)
1779 struct bset *i = b->sets[b->nsets].data;
1780
1781 memmove((uint64_t *) where + bkey_u64s(insert),
1782 where,
1783 (void *) end(i) - (void *) where);
1784
1785 i->keys += bkey_u64s(insert);
1786 bkey_copy(where, insert);
1787 bch_bset_fix_lookup_table(b, where);
1788}
1789
1790static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1791 struct btree_iter *iter,
1792 struct bkey *replace_key)
1793{ 1789{
1794 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1790 unsigned status;
1795 {
1796 if (KEY_DIRTY(k))
1797 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1798 offset, -sectors);
1799 }
1800
1801 uint64_t old_offset;
1802 unsigned old_size, sectors_found = 0;
1803
1804 while (1) {
1805 struct bkey *k = bch_btree_iter_next(iter);
1806 if (!k ||
1807 bkey_cmp(&START_KEY(k), insert) >= 0)
1808 break;
1809
1810 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1811 continue;
1812
1813 old_offset = KEY_START(k);
1814 old_size = KEY_SIZE(k);
1815
1816 /*
1817 * We might overlap with 0 size extents; we can't skip these
1818 * because if they're in the set we're inserting to we have to
1819 * adjust them so they don't overlap with the key we're
1820 * inserting. But we don't want to check them for replace
1821 * operations.
1822 */
1823
1824 if (replace_key && KEY_SIZE(k)) {
1825 /*
1826 * k might have been split since we inserted/found the
1827 * key we're replacing
1828 */
1829 unsigned i;
1830 uint64_t offset = KEY_START(k) -
1831 KEY_START(replace_key);
1832
1833 /* But it must be a subset of the replace key */
1834 if (KEY_START(k) < KEY_START(replace_key) ||
1835 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1836 goto check_failed;
1837
1838 /* We didn't find a key that we were supposed to */
1839 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1840 goto check_failed;
1841
1842 if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1843 KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1844 goto check_failed;
1845
1846 /* skip past gen */
1847 offset <<= 8;
1848
1849 BUG_ON(!KEY_PTRS(replace_key));
1850 1791
1851 for (i = 0; i < KEY_PTRS(replace_key); i++) 1792 BUG_ON(bkey_cmp(k, &b->key) > 0);
1852 if (k->ptr[i] != replace_key->ptr[i] + offset)
1853 goto check_failed;
1854
1855 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1856 }
1857
1858 if (bkey_cmp(insert, k) < 0 &&
1859 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1860 /*
1861 * We overlapped in the middle of an existing key: that
1862 * means we have to split the old key. But we have to do
1863 * slightly different things depending on whether the
1864 * old key has been written out yet.
1865 */
1866
1867 struct bkey *top;
1868
1869 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1870
1871 if (bkey_written(b, k)) {
1872 /*
1873 * We insert a new key to cover the top of the
1874 * old key, and the old key is modified in place
1875 * to represent the bottom split.
1876 *
1877 * It's completely arbitrary whether the new key
1878 * is the top or the bottom, but it has to match
1879 * up with what btree_sort_fixup() does - it
1880 * doesn't check for this kind of overlap, it
1881 * depends on us inserting a new key for the top
1882 * here.
1883 */
1884 top = bch_bset_search(b, &b->sets[b->nsets],
1885 insert);
1886 shift_keys(b, top, k);
1887 } else {
1888 BKEY_PADDED(key) temp;
1889 bkey_copy(&temp.key, k);
1890 shift_keys(b, k, &temp.key);
1891 top = bkey_next(k);
1892 }
1893
1894 bch_cut_front(insert, top);
1895 bch_cut_back(&START_KEY(insert), k);
1896 bch_bset_fix_invalidated_key(b, k);
1897 return false;
1898 }
1899
1900 if (bkey_cmp(insert, k) < 0) {
1901 bch_cut_front(insert, k);
1902 } else {
1903 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1904 old_offset = KEY_START(insert);
1905
1906 if (bkey_written(b, k) &&
1907 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1908 /*
1909 * Completely overwrote, so we don't have to
1910 * invalidate the binary search tree
1911 */
1912 bch_cut_front(k, k);
1913 } else {
1914 __bch_cut_back(&START_KEY(insert), k);
1915 bch_bset_fix_invalidated_key(b, k);
1916 }
1917 }
1918
1919 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1920 }
1921 1793
1922check_failed: 1794 status = bch_btree_insert_key(&b->keys, k, replace_key);
1923 if (replace_key) { 1795 if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1924 if (!sectors_found) { 1796 bch_check_keys(&b->keys, "%u for %s", status,
1925 return true; 1797 replace_key ? "replace" : "insert");
1926 } else if (sectors_found < KEY_SIZE(insert)) {
1927 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1928 (KEY_SIZE(insert) - sectors_found));
1929 SET_KEY_SIZE(insert, sectors_found);
1930 }
1931 }
1932 1798
1933 return false; 1799 trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1800 status);
1801 return true;
1802 } else
1803 return false;
1934} 1804}
1935 1805
1936static bool btree_insert_key(struct btree *b, struct btree_op *op, 1806static size_t insert_u64s_remaining(struct btree *b)
1937 struct bkey *k, struct bkey *replace_key)
1938{ 1807{
1939 struct bset *i = b->sets[b->nsets].data; 1808 long ret = bch_btree_keys_u64s_remaining(&b->keys);
1940 struct bkey *m, *prev;
1941 unsigned status = BTREE_INSERT_STATUS_INSERT;
1942
1943 BUG_ON(bkey_cmp(k, &b->key) > 0);
1944 BUG_ON(b->level && !KEY_PTRS(k));
1945 BUG_ON(!b->level && !KEY_OFFSET(k));
1946
1947 if (!b->level) {
1948 struct btree_iter iter;
1949
1950 /*
1951 * bset_search() returns the first key that is strictly greater
1952 * than the search key - but for back merging, we want to find
1953 * the previous key.
1954 */
1955 prev = NULL;
1956 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1957 1809
1958 if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1810 /*
1959 op->insert_collision = true; 1811 * Might land in the middle of an existing extent and have to split it
1960 return false; 1812 */
1961 } 1813 if (b->keys.ops->is_extents)
1962 1814 ret -= KEY_MAX_U64S;
1963 if (KEY_DIRTY(k))
1964 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1965 KEY_START(k), KEY_SIZE(k));
1966
1967 while (m != end(i) &&
1968 bkey_cmp(k, &START_KEY(m)) > 0)
1969 prev = m, m = bkey_next(m);
1970
1971 if (key_merging_disabled(b->c))
1972 goto insert;
1973
1974 /* prev is in the tree, if we merge we're done */
1975 status = BTREE_INSERT_STATUS_BACK_MERGE;
1976 if (prev &&
1977 bch_bkey_try_merge(b, prev, k))
1978 goto merged;
1979
1980 status = BTREE_INSERT_STATUS_OVERWROTE;
1981 if (m != end(i) &&
1982 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1983 goto copy;
1984
1985 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1986 if (m != end(i) &&
1987 bch_bkey_try_merge(b, k, m))
1988 goto copy;
1989 } else {
1990 BUG_ON(replace_key);
1991 m = bch_bset_search(b, &b->sets[b->nsets], k);
1992 }
1993
1994insert: shift_keys(b, m, k);
1995copy: bkey_copy(m, k);
1996merged:
1997 bch_check_keys(b, "%u for %s", status,
1998 replace_key ? "replace" : "insert");
1999
2000 if (b->level && !KEY_OFFSET(k))
2001 btree_current_write(b)->prio_blocked++;
2002
2003 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2004 1815
2005 return true; 1816 return max(ret, 0L);
2006} 1817}
2007 1818
2008static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1819static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2010 struct bkey *replace_key) 1821 struct bkey *replace_key)
2011{ 1822{
2012 bool ret = false; 1823 bool ret = false;
2013 int oldsize = bch_count_data(b); 1824 int oldsize = bch_count_data(&b->keys);
2014 1825
2015 while (!bch_keylist_empty(insert_keys)) { 1826 while (!bch_keylist_empty(insert_keys)) {
2016 struct bset *i = write_block(b);
2017 struct bkey *k = insert_keys->keys; 1827 struct bkey *k = insert_keys->keys;
2018 1828
2019 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1829 if (bkey_u64s(k) > insert_u64s_remaining(b))
2020 > btree_blocks(b))
2021 break; 1830 break;
2022 1831
2023 if (bkey_cmp(k, &b->key) <= 0) { 1832 if (bkey_cmp(k, &b->key) <= 0) {
2024 if (!b->level) 1833 if (!b->level)
2025 bkey_put(b->c, k); 1834 bkey_put(b->c, k);
2026 1835
2027 ret |= btree_insert_key(b, op, k, replace_key); 1836 ret |= btree_insert_key(b, k, replace_key);
2028 bch_keylist_pop_front(insert_keys); 1837 bch_keylist_pop_front(insert_keys);
2029 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1838 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2030 BKEY_PADDED(key) temp; 1839 BKEY_PADDED(key) temp;
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2033 bch_cut_back(&b->key, &temp.key); 1842 bch_cut_back(&b->key, &temp.key);
2034 bch_cut_front(&b->key, insert_keys->keys); 1843 bch_cut_front(&b->key, insert_keys->keys);
2035 1844
2036 ret |= btree_insert_key(b, op, &temp.key, replace_key); 1845 ret |= btree_insert_key(b, &temp.key, replace_key);
2037 break; 1846 break;
2038 } else { 1847 } else {
2039 break; 1848 break;
2040 } 1849 }
2041 } 1850 }
2042 1851
1852 if (!ret)
1853 op->insert_collision = true;
1854
2043 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1855 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2044 1856
2045 BUG_ON(bch_count_data(b) < oldsize); 1857 BUG_ON(bch_count_data(&b->keys) < oldsize);
2046 return ret; 1858 return ret;
2047} 1859}
2048 1860
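With this hunk, btree_insert_key() defers the per-key work to bch_btree_insert_key() in the bset code, and bch_btree_insert_keys() only decides whether a key still fits using insert_u64s_remaining(); for extent nodes that check keeps back room for one worst-case key, because an insert can land in the middle of an existing extent and split it in two. A sketch of just that headroom rule follows; the value 8 is an assumed placeholder for the worst-case key size, not the kernel's KEY_MAX_U64S.

#include <stdbool.h>
#include <stdio.h>

#define WORST_CASE_KEY_U64S 8   /* assumed placeholder, not the kernel's KEY_MAX_U64S */

static long insert_u64s_remaining(long free_u64s, bool is_extents)
{
        long ret = free_u64s;

        /* an insert into an extent node may split an existing extent in two */
        if (is_extents)
                ret -= WORST_CASE_KEY_U64S;

        return ret > 0 ? ret : 0;
}

int main(void)
{
        printf("%ld\n", insert_u64s_remaining(20, true));       /* 12 */
        printf("%ld\n", insert_u64s_remaining(5, true));        /* 0: forces a split */
        return 0;
}

In the patch, bch_btree_insert_keys() breaks out of its loop as soon as bkey_u64s(k) exceeds this value, and bch_btree_insert_node() uses the same function to decide when to take the split path.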
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2059 closure_init_stack(&cl); 1871 closure_init_stack(&cl);
2060 bch_keylist_init(&parent_keys); 1872 bch_keylist_init(&parent_keys);
2061 1873
1874 if (!b->level &&
1875 btree_check_reserve(b, op))
1876 return -EINTR;
1877
2062 n1 = btree_node_alloc_replacement(b, true); 1878 n1 = btree_node_alloc_replacement(b, true);
2063 if (IS_ERR(n1)) 1879 if (IS_ERR(n1))
2064 goto err; 1880 goto err;
2065 1881
2066 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1882 split = set_blocks(btree_bset_first(n1),
1883 block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
2067 1884
2068 if (split) { 1885 if (split) {
2069 unsigned keys = 0; 1886 unsigned keys = 0;
2070 1887
2071 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1888 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2072 1889
2073 n2 = bch_btree_node_alloc(b->c, b->level, true); 1890 n2 = bch_btree_node_alloc(b->c, b->level, true);
2074 if (IS_ERR(n2)) 1891 if (IS_ERR(n2))
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
2087 * search tree yet 1904 * search tree yet
2088 */ 1905 */
2089 1906
2090 while (keys < (n1->sets[0].data->keys * 3) / 5) 1907 while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2091 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1908 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
1909 keys));
2092 1910
2093 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1911 bkey_copy_key(&n1->key,
2094 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1912 bset_bkey_idx(btree_bset_first(n1), keys));
1913 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2095 1914
2096 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1915 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2097 n1->sets[0].data->keys = keys; 1916 btree_bset_first(n1)->keys = keys;
2098 1917
2099 memcpy(n2->sets[0].data->start, 1918 memcpy(btree_bset_first(n2)->start,
2100 end(n1->sets[0].data), 1919 bset_bkey_last(btree_bset_first(n1)),
2101 n2->sets[0].data->keys * sizeof(uint64_t)); 1920 btree_bset_first(n2)->keys * sizeof(uint64_t));
2102 1921
2103 bkey_copy_key(&n2->key, &b->key); 1922 bkey_copy_key(&n2->key, &b->key);
2104 1923
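The split path above now walks whole keys with bset_bkey_idx() until roughly three fifths of the 64-bit words stay in n1, then copies the remainder into n2; it cannot cut at an arbitrary offset because bkeys are variable length. A small sketch of that walk over an array of per-key sizes (the layout is simplified; in the kernel the sizes come from bkey_u64s()):

#include <stdio.h>

/*
 * Walk whole keys until at least 3/5 of the node's 64-bit words stay in the
 * front node; bkeys are variable length, so the cut can only fall on a key
 * boundary.
 */
static unsigned split_point(const unsigned *key_u64s, unsigned nkeys,
                            unsigned total_u64s)
{
        unsigned used = 0, i = 0;

        while (used < total_u64s * 3 / 5 && i < nkeys)
                used += key_u64s[i++];

        return used;    /* words kept by the front node */
}

int main(void)
{
        unsigned sizes[] = { 3, 3, 3, 3 };      /* four keys, 12 words total */

        printf("front node keeps %u of 12 words\n",
               split_point(sizes, 4, 12));      /* 9: first boundary past 3/5 */
        return 0;
}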
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
2106 bch_btree_node_write(n2, &cl); 1925 bch_btree_node_write(n2, &cl);
2107 rw_unlock(true, n2); 1926 rw_unlock(true, n2);
2108 } else { 1927 } else {
2109 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1928 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2110 1929
2111 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1930 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2112 } 1931 }
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2149 1968
2150 return 0; 1969 return 0;
2151err_free2: 1970err_free2:
1971 bkey_put(b->c, &n2->key);
2152 btree_node_free(n2); 1972 btree_node_free(n2);
2153 rw_unlock(true, n2); 1973 rw_unlock(true, n2);
2154err_free1: 1974err_free1:
1975 bkey_put(b->c, &n1->key);
2155 btree_node_free(n1); 1976 btree_node_free(n1);
2156 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
2157err: 1978err:
1979 WARN(1, "bcache: btree split failed");
1980
2158 if (n3 == ERR_PTR(-EAGAIN) || 1981 if (n3 == ERR_PTR(-EAGAIN) ||
2159 n2 == ERR_PTR(-EAGAIN) || 1982 n2 == ERR_PTR(-EAGAIN) ||
2160 n1 == ERR_PTR(-EAGAIN)) 1983 n1 == ERR_PTR(-EAGAIN))
2161 return -EAGAIN; 1984 return -EAGAIN;
2162 1985
2163 pr_warn("couldn't split");
2164 return -ENOMEM; 1986 return -ENOMEM;
2165} 1987}
2166 1988
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2171{ 1993{
2172 BUG_ON(b->level && replace_key); 1994 BUG_ON(b->level && replace_key);
2173 1995
2174 if (should_split(b)) { 1996 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2175 if (current->bio_list) { 1997 if (current->bio_list) {
2176 op->lock = b->c->root->level + 1; 1998 op->lock = b->c->root->level + 1;
2177 return -EAGAIN; 1999 return -EAGAIN;
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2180 return -EINTR; 2002 return -EINTR;
2181 } else { 2003 } else {
2182 /* Invalidated all iterators */ 2004 /* Invalidated all iterators */
2183 return btree_split(b, op, insert_keys, replace_key) ?: 2005 int ret = btree_split(b, op, insert_keys, replace_key);
2184 -EINTR; 2006
2007 return bch_keylist_empty(insert_keys) ?
2008 0 : ret ?: -EINTR;
2185 } 2009 }
2186 } else { 2010 } else {
2187 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2011 BUG_ON(write_block(b) != btree_bset_last(b));
2188 2012
2189 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2013 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2190 if (!b->level) 2014 if (!b->level)
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2323 struct bkey *k; 2147 struct bkey *k;
2324 struct btree_iter iter; 2148 struct btree_iter iter;
2325 2149
2326 bch_btree_iter_init(b, &iter, from); 2150 bch_btree_iter_init(&b->keys, &iter, from);
2327 2151
2328 while ((k = bch_btree_iter_next_filter(&iter, b, 2152 while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2329 bch_ptr_bad))) { 2153 bch_ptr_bad))) {
2330 ret = btree(map_nodes_recurse, k, b, 2154 ret = btree(map_nodes_recurse, k, b,
2331 op, from, fn, flags); 2155 op, from, fn, flags);
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2356 struct bkey *k; 2180 struct bkey *k;
2357 struct btree_iter iter; 2181 struct btree_iter iter;
2358 2182
2359 bch_btree_iter_init(b, &iter, from); 2183 bch_btree_iter_init(&b->keys, &iter, from);
2360 2184
2361 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2185 while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2362 ret = !b->level 2186 ret = !b->level
2363 ? fn(op, b, k) 2187 ? fn(op, b, k)
2364 : btree(map_keys_recurse, k, b, op, from, fn, flags); 2188 : btree(map_keys_recurse, k, b, op, from, fn, flags);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 767e75570896..af065e97e55c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -130,20 +130,12 @@ struct btree {
130 unsigned long flags; 130 unsigned long flags;
131 uint16_t written; /* would be nice to kill */ 131 uint16_t written; /* would be nice to kill */
132 uint8_t level; 132 uint8_t level;
133 uint8_t nsets; 133
134 uint8_t page_order; 134 struct btree_keys keys;
135
136 /*
137 * Set of sorted keys - the real btree node - plus a binary search tree
138 *
139 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
140 * to the memory we have allocated for this btree node. Additionally,
141 * set[0]->data points to the entire btree node as it exists on disk.
142 */
143 struct bset_tree sets[MAX_BSETS];
144 135
145 /* For outstanding btree writes, used as a lock - protects write_idx */ 136 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 137 struct closure io;
138 struct semaphore io_mutex;
147 139
148 struct list_head list; 140 struct list_head list;
149 struct delayed_work work; 141 struct delayed_work work;
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
179 return b->writes + (btree_node_write_idx(b) ^ 1); 171 return b->writes + (btree_node_write_idx(b) ^ 1);
180} 172}
181 173
182static inline unsigned bset_offset(struct btree *b, struct bset *i) 174static inline struct bset *btree_bset_first(struct btree *b)
183{ 175{
184 return (((size_t) i) - ((size_t) b->sets->data)) >> 9; 176 return b->keys.set->data;
185} 177}
186 178
187static inline struct bset *write_block(struct btree *b) 179static inline struct bset *btree_bset_last(struct btree *b)
188{ 180{
189 return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); 181 return bset_tree_last(&b->keys)->data;
190} 182}
191 183
192static inline bool bset_written(struct btree *b, struct bset_tree *t) 184static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
193{ 185{
194 return t->data < write_block(b); 186 return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
195}
196
197static inline bool bkey_written(struct btree *b, struct bkey *k)
198{
199 return k < write_block(b)->start;
200} 187}
201 188
202static inline void set_gc_sectors(struct cache_set *c) 189static inline void set_gc_sectors(struct cache_set *c)
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c)
204 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); 191 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
205} 192}
206 193
207static inline struct bkey *bch_btree_iter_init(struct btree *b,
208 struct btree_iter *iter,
209 struct bkey *search)
210{
211 return __bch_btree_iter_init(b, iter, search, b->sets);
212}
213
214static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
215{
216 if (b->level)
217 return bch_btree_ptr_invalid(b->c, k);
218 else
219 return bch_extent_ptr_invalid(b->c, k);
220}
221
222void bkey_put(struct cache_set *c, struct bkey *k); 194void bkey_put(struct cache_set *c, struct bkey *k);
223 195
224/* Looping macros */ 196/* Looping macros */
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
229 iter++) \ 201 iter++) \
230 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) 202 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
231 203
232#define for_each_key_filter(b, k, iter, filter) \
233 for (bch_btree_iter_init((b), (iter), NULL); \
234 ((k) = bch_btree_iter_next_filter((iter), b, filter));)
235
236#define for_each_key(b, k, iter) \
237 for (bch_btree_iter_init((b), (iter), NULL); \
238 ((k) = bch_btree_iter_next(iter));)
239
240/* Recursing down the btree */ 204/* Recursing down the btree */
241 205
242struct btree_op { 206struct btree_op {
207 /* for waiting on btree reserve in btree_split() */
208 wait_queue_t wait;
209
243 /* Btree level at which we start taking write locks */ 210 /* Btree level at which we start taking write locks */
244 short lock; 211 short lock;
245 212
@@ -249,6 +216,7 @@ struct btree_op {
249static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) 216static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
250{ 217{
251 memset(op, 0, sizeof(struct btree_op)); 218 memset(op, 0, sizeof(struct btree_op));
219 init_wait(&op->wait);
252 op->lock = write_lock_level; 220 op->lock = write_lock_level;
253} 221}
254 222
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
267 (w ? up_write : up_read)(&b->lock); 235 (w ? up_write : up_read)(&b->lock);
268} 236}
269 237
270void bch_btree_node_read(struct btree *); 238void bch_btree_node_read_done(struct btree *);
271void bch_btree_node_write(struct btree *, struct closure *); 239void bch_btree_node_write(struct btree *, struct closure *);
272 240
273void bch_btree_set_root(struct btree *); 241void bch_btree_set_root(struct btree *);
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index dfff2410322e..7a228de95fd7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,19 +11,6 @@
11 11
12#include "closure.h" 12#include "closure.h"
13 13
14#define CL_FIELD(type, field) \
15 case TYPE_ ## type: \
16 return &container_of(cl, struct type, cl)->field
17
18static struct closure_waitlist *closure_waitlist(struct closure *cl)
19{
20 switch (cl->type) {
21 CL_FIELD(closure_with_waitlist, wait);
22 default:
23 return NULL;
24 }
25}
26
27static inline void closure_put_after_sub(struct closure *cl, int flags) 14static inline void closure_put_after_sub(struct closure *cl, int flags)
28{ 15{
29 int r = flags & CLOSURE_REMAINING_MASK; 16 int r = flags & CLOSURE_REMAINING_MASK;
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
42 closure_queue(cl); 29 closure_queue(cl);
43 } else { 30 } else {
44 struct closure *parent = cl->parent; 31 struct closure *parent = cl->parent;
45 struct closure_waitlist *wait = closure_waitlist(cl);
46 closure_fn *destructor = cl->fn; 32 closure_fn *destructor = cl->fn;
47 33
48 closure_debug_destroy(cl); 34 closure_debug_destroy(cl);
49 35
50 smp_mb();
51 atomic_set(&cl->remaining, -1);
52
53 if (wait)
54 closure_wake_up(wait);
55
56 if (destructor) 36 if (destructor)
57 destructor(cl); 37 destructor(cl);
58 38
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
69} 49}
70EXPORT_SYMBOL(closure_sub); 50EXPORT_SYMBOL(closure_sub);
71 51
52/**
53 * closure_put - decrement a closure's refcount
54 */
72void closure_put(struct closure *cl) 55void closure_put(struct closure *cl)
73{ 56{
74 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 57 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
75} 58}
76EXPORT_SYMBOL(closure_put); 59EXPORT_SYMBOL(closure_put);
77 60
78static void set_waiting(struct closure *cl, unsigned long f) 61/**
79{ 62 * closure_wake_up - wake up all closures on a wait list, without memory barrier
80#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 63 */
81 cl->waiting_on = f;
82#endif
83}
84
85void __closure_wake_up(struct closure_waitlist *wait_list) 64void __closure_wake_up(struct closure_waitlist *wait_list)
86{ 65{
87 struct llist_node *list; 66 struct llist_node *list;
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
106 cl = container_of(reverse, struct closure, list); 85 cl = container_of(reverse, struct closure, list);
107 reverse = llist_next(reverse); 86 reverse = llist_next(reverse);
108 87
109 set_waiting(cl, 0); 88 closure_set_waiting(cl, 0);
110 closure_sub(cl, CLOSURE_WAITING + 1); 89 closure_sub(cl, CLOSURE_WAITING + 1);
111 } 90 }
112} 91}
113EXPORT_SYMBOL(__closure_wake_up); 92EXPORT_SYMBOL(__closure_wake_up);
114 93
115bool closure_wait(struct closure_waitlist *list, struct closure *cl) 94/**
95 * closure_wait - add a closure to a waitlist
96 *
97 * @waitlist will own a ref on @cl, which will be released when
98 * closure_wake_up() is called on @waitlist.
99 *
100 */
101bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
116{ 102{
117 if (atomic_read(&cl->remaining) & CLOSURE_WAITING) 103 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
118 return false; 104 return false;
119 105
120 set_waiting(cl, _RET_IP_); 106 closure_set_waiting(cl, _RET_IP_);
121 atomic_add(CLOSURE_WAITING + 1, &cl->remaining); 107 atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
122 llist_add(&cl->list, &list->list); 108 llist_add(&cl->list, &waitlist->list);
123 109
124 return true; 110 return true;
125} 111}
126EXPORT_SYMBOL(closure_wait); 112EXPORT_SYMBOL(closure_wait);
127 113
128/** 114/**
 129 * closure_sync() - sleep until a closure has nothing left to wait on 115 * closure_sync - sleep until a closure has nothing left to wait on
130 * 116 *
131 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 117 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
132 * the last refcount. 118 * the last refcount.
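The kernel-doc added above spells out the closure_wait()/closure_wake_up() contract: putting a closure on a waitlist takes one reference and sets CLOSURE_WAITING, waking drops both, and a closure can sit on at most one waitlist at a time. A userspace model of that refcount-plus-flag bookkeeping (mini_closure and friends are illustrative names, not the kernel types, and the real code does this atomically):

#include <stdbool.h>
#include <stdio.h>

#define WAITING (1 << 16)       /* models CLOSURE_WAITING above the refcount bits */

struct mini_closure {
        int remaining;          /* low bits: refcount, high bit: WAITING flag */
};

/* Adding to a waitlist takes one ref and sets the flag; at most one list. */
static bool mini_wait(struct mini_closure *cl)
{
        if (cl->remaining & WAITING)
                return false;

        cl->remaining += WAITING + 1;
        return true;
}

/* Waking clears the flag and drops the ref the waitlist was holding. */
static void mini_wake(struct mini_closure *cl)
{
        cl->remaining -= WAITING + 1;
}

int main(void)
{
        struct mini_closure cl = { .remaining = 1 };

        printf("%d %d\n", mini_wait(&cl), mini_wait(&cl));      /* 1 0 */
        mini_wake(&cl);
        printf("remaining=%d\n", cl.remaining);                 /* back to 1 */
        return 0;
}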
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
148} 134}
149EXPORT_SYMBOL(closure_sync); 135EXPORT_SYMBOL(closure_sync);
150 136
151/**
152 * closure_trylock() - try to acquire the closure, without waiting
153 * @cl: closure to lock
154 *
 155 * Returns true if the closure was successfully locked.
156 */
157bool closure_trylock(struct closure *cl, struct closure *parent)
158{
159 if (atomic_cmpxchg(&cl->remaining, -1,
160 CLOSURE_REMAINING_INITIALIZER) != -1)
161 return false;
162
163 smp_mb();
164
165 cl->parent = parent;
166 if (parent)
167 closure_get(parent);
168
169 closure_set_ret_ip(cl);
170 closure_debug_create(cl);
171 return true;
172}
173EXPORT_SYMBOL(closure_trylock);
174
175void __closure_lock(struct closure *cl, struct closure *parent,
176 struct closure_waitlist *wait_list)
177{
178 struct closure wait;
179 closure_init_stack(&wait);
180
181 while (1) {
182 if (closure_trylock(cl, parent))
183 return;
184
185 closure_wait_event(wait_list, &wait,
186 atomic_read(&cl->remaining) == -1);
187 }
188}
189EXPORT_SYMBOL(__closure_lock);
190
191#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 137#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
192 138
193static LIST_HEAD(closure_list); 139static LIST_HEAD(closure_list);
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9762f1be3304..7ef7461912be 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -72,30 +72,6 @@
72 * closure - _always_ use continue_at(). Doing so consistently will help 72 * closure - _always_ use continue_at(). Doing so consistently will help
73 * eliminate an entire class of particularly pernicious races. 73 * eliminate an entire class of particularly pernicious races.
74 * 74 *
75 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
76 *
77 * struct closure_waitlist list;
78 * closure_wait_event(list, cl, condition);
79 * closure_wake_up(wait_list);
80 *
 81 * These work analogously to wait_event() and wake_up() - except that instead of
82 * operating on the current thread (for wait_event()) and lists of threads, they
83 * operate on an explicit closure and lists of closures.
84 *
85 * Because it's a closure we can now wait either synchronously or
86 * asynchronously. closure_wait_event() returns the current value of the
87 * condition, and if it returned false continue_at() or closure_sync() can be
88 * used to wait for it to become true.
89 *
90 * It's useful for waiting on things when you can't sleep in the context in
91 * which you must check the condition (perhaps a spinlock held, or you might be
92 * beneath generic_make_request() - in which case you can't sleep on IO).
93 *
94 * closure_wait_event() will wait either synchronously or asynchronously,
95 * depending on whether the closure is in blocking mode or not. You can pick a
96 * mode explicitly with closure_wait_event_sync() and
97 * closure_wait_event_async(), which do just what you might expect.
98 *
99 * Lastly, you might have a wait list dedicated to a specific event, and have no 75 * Lastly, you might have a wait list dedicated to a specific event, and have no
100 * need for specifying the condition - you just want to wait until someone runs 76 * need for specifying the condition - you just want to wait until someone runs
101 * closure_wake_up() on the appropriate wait list. In that case, just use 77 * closure_wake_up() on the appropriate wait list. In that case, just use
@@ -121,40 +97,6 @@
121 * All this implies that a closure should typically be embedded in a particular 97 * All this implies that a closure should typically be embedded in a particular
122 * struct (which its refcount will normally control the lifetime of), and that 98 * struct (which its refcount will normally control the lifetime of), and that
123 * struct can very much be thought of as a stack frame. 99 * struct can very much be thought of as a stack frame.
124 *
125 * Locking:
126 *
127 * Closures are based on work items but they can be thought of as more like
128 * threads - in that like threads and unlike work items they have a well
129 * defined lifetime; they are created (with closure_init()) and eventually
130 * complete after a continue_at(cl, NULL, NULL).
131 *
132 * Suppose you've got some larger structure with a closure embedded in it that's
133 * used for periodically doing garbage collection. You only want one garbage
134 * collection happening at a time, so the natural thing to do is protect it with
135 * a lock. However, it's difficult to use a lock protecting a closure correctly
136 * because the unlock should come after the last continue_to() (additionally, if
137 * you're using the closure asynchronously a mutex won't work since a mutex has
138 * to be unlocked by the same process that locked it).
139 *
140 * So to make it less error prone and more efficient, we also have the ability
141 * to use closures as locks:
142 *
143 * closure_init_unlocked();
144 * closure_trylock();
145 *
146 * That's all we need for trylock() - the last closure_put() implicitly unlocks
147 * it for you. But for closure_lock(), we also need a wait list:
148 *
149 * struct closure_with_waitlist frobnicator_cl;
150 *
151 * closure_init_unlocked(&frobnicator_cl);
152 * closure_lock(&frobnicator_cl);
153 *
154 * A closure_with_waitlist embeds a closure and a wait list - much like struct
155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you.
158 */ 100 */
159 101
160struct closure; 102struct closure;
@@ -164,12 +106,6 @@ struct closure_waitlist {
164 struct llist_head list; 106 struct llist_head list;
165}; 107};
166 108
167enum closure_type {
168 TYPE_closure = 0,
169 TYPE_closure_with_waitlist = 1,
170 MAX_CLOSURE_TYPE = 1,
171};
172
173enum closure_state { 109enum closure_state {
174 /* 110 /*
175 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 111 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
@@ -224,8 +160,6 @@ struct closure {
224 160
225 atomic_t remaining; 161 atomic_t remaining;
226 162
227 enum closure_type type;
228
229#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 163#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
230#define CLOSURE_MAGIC_DEAD 0xc054dead 164#define CLOSURE_MAGIC_DEAD 0xc054dead
231#define CLOSURE_MAGIC_ALIVE 0xc054a11e 165#define CLOSURE_MAGIC_ALIVE 0xc054a11e
@@ -237,34 +171,12 @@ struct closure {
237#endif 171#endif
238}; 172};
239 173
240struct closure_with_waitlist {
241 struct closure cl;
242 struct closure_waitlist wait;
243};
244
245extern unsigned invalid_closure_type(void);
246
247#define __CLOSURE_TYPE(cl, _t) \
248 __builtin_types_compatible_p(typeof(cl), struct _t) \
249 ? TYPE_ ## _t : \
250
251#define __closure_type(cl) \
252( \
253 __CLOSURE_TYPE(cl, closure) \
254 __CLOSURE_TYPE(cl, closure_with_waitlist) \
255 invalid_closure_type() \
256)
257
258void closure_sub(struct closure *cl, int v); 174void closure_sub(struct closure *cl, int v);
259void closure_put(struct closure *cl); 175void closure_put(struct closure *cl);
260void __closure_wake_up(struct closure_waitlist *list); 176void __closure_wake_up(struct closure_waitlist *list);
261bool closure_wait(struct closure_waitlist *list, struct closure *cl); 177bool closure_wait(struct closure_waitlist *list, struct closure *cl);
262void closure_sync(struct closure *cl); 178void closure_sync(struct closure *cl);
263 179
264bool closure_trylock(struct closure *cl, struct closure *parent);
265void __closure_lock(struct closure *cl, struct closure *parent,
266 struct closure_waitlist *wait_list);
267
268#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 180#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
269 181
270void closure_debug_init(void); 182void closure_debug_init(void);
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl)
293#endif 205#endif
294} 206}
295 207
296static inline void closure_get(struct closure *cl) 208static inline void closure_set_waiting(struct closure *cl, unsigned long f)
297{ 209{
298#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 210#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
299 BUG_ON((atomic_inc_return(&cl->remaining) & 211 cl->waiting_on = f;
300 CLOSURE_REMAINING_MASK) <= 1);
301#else
302 atomic_inc(&cl->remaining);
303#endif 212#endif
304} 213}
305 214
306static inline void closure_set_stopped(struct closure *cl) 215static inline void __closure_end_sleep(struct closure *cl)
307{ 216{
308 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 217 __set_current_state(TASK_RUNNING);
218
219 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
220 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
309} 221}
310 222
311static inline bool closure_is_unlocked(struct closure *cl) 223static inline void __closure_start_sleep(struct closure *cl)
312{ 224{
313 return atomic_read(&cl->remaining) == -1; 225 closure_set_ip(cl);
226 cl->task = current;
227 set_current_state(TASK_UNINTERRUPTIBLE);
228
229 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
230 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
314} 231}
315 232
316static inline void do_closure_init(struct closure *cl, struct closure *parent, 233static inline void closure_set_stopped(struct closure *cl)
317 bool running)
318{ 234{
319 cl->parent = parent; 235 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
320 if (parent) 236}
321 closure_get(parent);
322
323 if (running) {
324 closure_debug_create(cl);
325 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
326 } else
327 atomic_set(&cl->remaining, -1);
328 237
238static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
239 struct workqueue_struct *wq)
240{
241 BUG_ON(object_is_on_stack(cl));
329 closure_set_ip(cl); 242 closure_set_ip(cl);
243 cl->fn = fn;
244 cl->wq = wq;
245 /* between atomic_dec() in closure_put() */
246 smp_mb__before_atomic_dec();
330} 247}
331 248
332/* 249static inline void closure_queue(struct closure *cl)
333 * Hack to get at the embedded closure if there is one, by doing an unsafe cast: 250{
334 * the result of __closure_type() is thrown away, it's used merely for type 251 struct workqueue_struct *wq = cl->wq;
335 * checking. 252 if (wq) {
336 */ 253 INIT_WORK(&cl->work, cl->work.func);
337#define __to_internal_closure(cl) \ 254 BUG_ON(!queue_work(wq, &cl->work));
338({ \ 255 } else
339 BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ 256 cl->fn(cl);
340 (struct closure *) cl; \ 257}
341})
342
343#define closure_init_type(cl, parent, running) \
344do { \
345 struct closure *_cl = __to_internal_closure(cl); \
346 _cl->type = __closure_type(*(cl)); \
347 do_closure_init(_cl, parent, running); \
348} while (0)
349 258
350/** 259/**
351 * __closure_init() - Initialize a closure, skipping the memset() 260 * closure_get - increment a closure's refcount
352 *
353 * May be used instead of closure_init() when memory has already been zeroed.
354 */ 261 */
355#define __closure_init(cl, parent) \ 262static inline void closure_get(struct closure *cl)
356 closure_init_type(cl, parent, true) 263{
264#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
265 BUG_ON((atomic_inc_return(&cl->remaining) &
266 CLOSURE_REMAINING_MASK) <= 1);
267#else
268 atomic_inc(&cl->remaining);
269#endif
270}
357 271
358/** 272/**
359 * closure_init() - Initialize a closure, setting the refcount to 1 273 * closure_init - Initialize a closure, setting the refcount to 1
360 * @cl: closure to initialize 274 * @cl: closure to initialize
361 * @parent: parent of the new closure. cl will take a refcount on it for its 275 * @parent: parent of the new closure. cl will take a refcount on it for its
362 * lifetime; may be NULL. 276 * lifetime; may be NULL.
363 */ 277 */
364#define closure_init(cl, parent) \ 278static inline void closure_init(struct closure *cl, struct closure *parent)
365do { \
366 memset((cl), 0, sizeof(*(cl))); \
367 __closure_init(cl, parent); \
368} while (0)
369
370static inline void closure_init_stack(struct closure *cl)
371{ 279{
372 memset(cl, 0, sizeof(struct closure)); 280 memset(cl, 0, sizeof(struct closure));
373 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 281 cl->parent = parent;
374} 282 if (parent)
375 283 closure_get(parent);
376/**
377 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
378 * @cl: closure to initialize
379 *
380 * For when the closure will be used as a lock. The closure may not be used
381 * until after a closure_lock() or closure_trylock().
382 */
383#define closure_init_unlocked(cl) \
384do { \
385 memset((cl), 0, sizeof(*(cl))); \
386 closure_init_type(cl, NULL, false); \
387} while (0)
388
389/**
390 * closure_lock() - lock and initialize a closure.
391 * @cl: the closure to lock
392 * @parent: the new parent for this closure
393 *
394 * The closure must be of one of the types that has a waitlist (otherwise we
395 * wouldn't be able to sleep on contention).
396 *
397 * @parent has exactly the same meaning as in closure_init(); if non null, the
398 * closure will take a reference on @parent which will be released when it is
399 * unlocked.
400 */
401#define closure_lock(cl, parent) \
402 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
403 284
404static inline void __closure_end_sleep(struct closure *cl) 285 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
405{
406 __set_current_state(TASK_RUNNING);
407 286
408 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) 287 closure_debug_create(cl);
409 atomic_sub(CLOSURE_SLEEPING, &cl->remaining); 288 closure_set_ip(cl);
410} 289}
411 290
412static inline void __closure_start_sleep(struct closure *cl) 291static inline void closure_init_stack(struct closure *cl)
413{ 292{
414 closure_set_ip(cl); 293 memset(cl, 0, sizeof(struct closure));
415 cl->task = current; 294 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
416 set_current_state(TASK_UNINTERRUPTIBLE);
417
418 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
419 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
420} 295}
421 296
422/** 297/**
423 * closure_wake_up() - wake up all closures on a wait list. 298 * closure_wake_up - wake up all closures on a wait list.
424 */ 299 */
425static inline void closure_wake_up(struct closure_waitlist *list) 300static inline void closure_wake_up(struct closure_waitlist *list)
426{ 301{
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list)
428 __closure_wake_up(list); 303 __closure_wake_up(list);
429} 304}
430 305
431/* 306/**
432 * Wait on an event, synchronously or asynchronously - analogous to wait_event() 307 * continue_at - jump to another function with barrier
433 * but for closures. 308 *
434 * 309 * After @cl is no longer waiting on anything (i.e. all outstanding refs have
435 * The loop is oddly structured so as to avoid a race; we must check the 310 * been dropped with closure_put()), it will resume execution at @fn running out
436 * condition again after we've added ourself to the waitlist. We know if we were 311 * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
437 * already on the waitlist because closure_wait() returns false; thus, we only 312 *
438 * schedule or break if closure_wait() returns false. If it returns true, we 313 * NOTE: This macro expands to a return in the calling function!
439 * just loop again - rechecking the condition. 314 *
440 * 315 * This is because after calling continue_at() you no longer have a ref on @cl,
441 * The __closure_wake_up() is necessary because we may race with the event 316 * and whatever @cl owns may be freed out from under you - a running closure fn
442 * becoming true; i.e. we see event false -> wait -> recheck condition, but the 317 * has a ref on its own closure which continue_at() drops.
443 * thread that made the event true may have called closure_wake_up() before we
444 * added ourselves to the wait list.

445 *
446 * We have to call closure_sync() at the end instead of just
447 * __closure_end_sleep() because a different thread might've called
448 * closure_wake_up() before us and gotten preempted before they dropped the
449 * refcount on our closure. If this was a stack allocated closure, that would be
450 * bad.
451 */ 318 */
452#define closure_wait_event(list, cl, condition) \
453({ \
454 typeof(condition) ret; \
455 \
456 while (1) { \
457 ret = (condition); \
458 if (ret) { \
459 __closure_wake_up(list); \
460 closure_sync(cl); \
461 break; \
462 } \
463 \
464 __closure_start_sleep(cl); \
465 \
466 if (!closure_wait(list, cl)) \
467 schedule(); \
468 } \
469 \
470 ret; \
471})
472
473static inline void closure_queue(struct closure *cl)
474{
475 struct workqueue_struct *wq = cl->wq;
476 if (wq) {
477 INIT_WORK(&cl->work, cl->work.func);
478 BUG_ON(!queue_work(wq, &cl->work));
479 } else
480 cl->fn(cl);
481}
482
483static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
484 struct workqueue_struct *wq)
485{
486 BUG_ON(object_is_on_stack(cl));
487 closure_set_ip(cl);
488 cl->fn = fn;
489 cl->wq = wq;
490 /* between atomic_dec() in closure_put() */
491 smp_mb__before_atomic_dec();
492}
493
494#define continue_at(_cl, _fn, _wq) \ 319#define continue_at(_cl, _fn, _wq) \
495do { \ 320do { \
496 set_closure_fn(_cl, _fn, _wq); \ 321 set_closure_fn(_cl, _fn, _wq); \
@@ -498,8 +323,28 @@ do { \
498 return; \ 323 return; \
499} while (0) 324} while (0)
500 325
326/**
327 * closure_return - finish execution of a closure
328 *
329 * This is used to indicate that @cl is finished: when all outstanding refs on
330 * @cl have been dropped @cl's ref on its parent closure (as passed to
331 * closure_init()) will be dropped, if one was specified - thus this can be
332 * thought of as returning to the parent closure.
333 */
501#define closure_return(_cl) continue_at((_cl), NULL, NULL) 334#define closure_return(_cl) continue_at((_cl), NULL, NULL)
502 335
336/**
337 * continue_at_nobarrier - jump to another function without barrier
338 *
339 * Causes @fn to be executed out of @cl, in @wq context (or called directly if
340 * @wq is NULL).
341 *
342 * NOTE: like continue_at(), this macro expands to a return in the caller!
343 *
344 * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
345 * thus it's not safe to touch anything protected by @cl after a
346 * continue_at_nobarrier().
347 */
503#define continue_at_nobarrier(_cl, _fn, _wq) \ 348#define continue_at_nobarrier(_cl, _fn, _wq) \
504do { \ 349do { \
505 set_closure_fn(_cl, _fn, _wq); \ 350 set_closure_fn(_cl, _fn, _wq); \
@@ -507,6 +352,15 @@ do { \
507 return; \ 352 return; \
508} while (0) 353} while (0)
509 354
355/**
 356 * closure_return_with_destructor - finish execution of a closure, with destructor
357 *
358 * Works like closure_return(), except @destructor will be called when all
359 * outstanding refs on @cl have been dropped; @destructor may be used to safely
360 * free the memory occupied by @cl, and it is called with the ref on the parent
361 * closure still held - so @destructor could safely return an item to a
362 * freelist protected by @cl's parent.
363 */
510#define closure_return_with_destructor(_cl, _destructor) \ 364#define closure_return_with_destructor(_cl, _destructor) \
511do { \ 365do { \
512 set_closure_fn(_cl, _destructor, NULL); \ 366 set_closure_fn(_cl, _destructor, NULL); \
@@ -514,6 +368,13 @@ do { \
514 return; \ 368 return; \
515} while (0) 369} while (0)
516 370
371/**
372 * closure_call - execute @fn out of a new, uninitialized closure
373 *
374 * Typically used when running out of one closure, and we want to run @fn
375 * asynchronously out of a new closure - @parent will then wait for @cl to
376 * finish.
377 */
517static inline void closure_call(struct closure *cl, closure_fn fn, 378static inline void closure_call(struct closure *cl, closure_fn fn,
518 struct workqueue_struct *wq, 379 struct workqueue_struct *wq,
519 struct closure *parent) 380 struct closure *parent)
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
522 continue_at_nobarrier(cl, fn, wq); 383 continue_at_nobarrier(cl, fn, wq);
523} 384}
524 385
525static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
526 struct workqueue_struct *wq,
527 struct closure *parent)
528{
529 if (closure_trylock(cl, parent))
530 continue_at_nobarrier(cl, fn, wq);
531}
532
533#endif /* _LINUX_CLOSURE_H */ 386#endif /* _LINUX_CLOSURE_H */
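Taken together, the comments added to closure.h describe a small state machine: closure_init() starts the refcount at 1, closure_get()/closure_put() adjust it, and continue_at() stores the next function plus workqueue and gives up the caller's ref, so the stored function runs once the last ref is dropped. A compact userspace model of that control flow, run synchronously with no workqueue; mini_closure, mini_put and mini_continue_at are stand-ins, not the kernel API.

#include <stdio.h>

struct mini_closure;
typedef void (*mini_fn)(struct mini_closure *);

struct mini_closure {
        int remaining;
        mini_fn fn;
};

static void mini_put(struct mini_closure *cl)
{
        if (--cl->remaining == 0 && cl->fn)
                cl->fn(cl);             /* last ref gone: resume at the stored fn */
}

static void mini_continue_at(struct mini_closure *cl, mini_fn fn)
{
        cl->fn = fn;
        mini_put(cl);                   /* the caller gives up its ref here */
}

static void stage2(struct mini_closure *cl)
{
        (void) cl;
        printf("stage2 runs once the last ref is dropped\n");
}

static void stage1(struct mini_closure *cl)
{
        printf("stage1 runs\n");
        mini_continue_at(cl, stage2);   /* the real macro also returns here */
}

int main(void)
{
        /* one ref for the running stage1, one for an outstanding "I/O" */
        struct mini_closure cl = { .remaining = 2, .fn = NULL };

        stage1(&cl);                    /* stage2 not yet run: the I/O ref is held */
        mini_put(&cl);                  /* "I/O completion" -> stage2 */
        return 0;
}

The NOTE in the continue_at() comment is the key point this models: in the kernel the macro really does return, because after it the running function no longer owns a reference on the closure.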
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 264fcfbd6290..8b1f1d5c1819 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,6 +8,7 @@
8#include "bcache.h" 8#include "bcache.h"
9#include "btree.h" 9#include "btree.h"
10#include "debug.h" 10#include "debug.h"
11#include "extents.h"
11 12
12#include <linux/console.h> 13#include <linux/console.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
@@ -17,163 +18,96 @@
17 18
18static struct dentry *debug; 19static struct dentry *debug;
19 20
20const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
21{
22 unsigned i;
23
24 for (i = 0; i < KEY_PTRS(k); i++)
25 if (ptr_available(c, k, i)) {
26 struct cache *ca = PTR_CACHE(c, k, i);
27 size_t bucket = PTR_BUCKET_NR(c, k, i);
28 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
29
30 if (KEY_SIZE(k) + r > c->sb.bucket_size)
31 return "bad, length too big";
32 if (bucket < ca->sb.first_bucket)
33 return "bad, short offset";
34 if (bucket >= ca->sb.nbuckets)
35 return "bad, offset past end of device";
36 if (ptr_stale(c, k, i))
37 return "stale";
38 }
39
40 if (!bkey_cmp(k, &ZERO_KEY))
41 return "bad, null key";
42 if (!KEY_PTRS(k))
43 return "bad, no pointers";
44 if (!KEY_SIZE(k))
45 return "zeroed key";
46 return "";
47}
48
49int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
50{
51 unsigned i = 0;
52 char *out = buf, *end = buf + size;
53
54#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
55
56 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
57
58 if (KEY_PTRS(k))
59 while (1) {
60 p("%llu:%llu gen %llu",
61 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
62
63 if (++i == KEY_PTRS(k))
64 break;
65
66 p(", ");
67 }
68
69 p("]");
70
71 if (KEY_DIRTY(k))
72 p(" dirty");
73 if (KEY_CSUM(k))
74 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
75#undef p
76 return out - buf;
77}
78
79#ifdef CONFIG_BCACHE_DEBUG 21#ifdef CONFIG_BCACHE_DEBUG
80 22
81static void dump_bset(struct btree *b, struct bset *i) 23#define for_each_written_bset(b, start, i) \
82{ 24 for (i = (start); \
83 struct bkey *k, *next; 25 (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
84 unsigned j; 26 i->seq == (start)->seq; \
85 char buf[80]; 27 i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
86 28 block_bytes(b->c))
87 for (k = i->start; k < end(i); k = next) {
88 next = bkey_next(k);
89
90 bch_bkey_to_text(buf, sizeof(buf), k);
91 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
92 (uint64_t *) k - i->d, i->keys, buf);
93
94 for (j = 0; j < KEY_PTRS(k); j++) {
95 size_t n = PTR_BUCKET_NR(b->c, k, j);
96 printk(" bucket %zu", n);
97
98 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
99 printk(" prio %i",
100 PTR_BUCKET(b->c, k, j)->prio);
101 }
102 29
103 printk(" %s\n", bch_ptr_status(b->c, k)); 30void bch_btree_verify(struct btree *b)
104
105 if (next < end(i) &&
106 bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
107 printk(KERN_ERR "Key skipped backwards\n");
108 }
109}
110
111static void bch_dump_bucket(struct btree *b)
112{
113 unsigned i;
114
115 console_lock();
116 for (i = 0; i <= b->nsets; i++)
117 dump_bset(b, b->sets[i].data);
118 console_unlock();
119}
120
121void bch_btree_verify(struct btree *b, struct bset *new)
122{ 31{
123 struct btree *v = b->c->verify_data; 32 struct btree *v = b->c->verify_data;
124 struct closure cl; 33 struct bset *ondisk, *sorted, *inmemory;
125 closure_init_stack(&cl); 34 struct bio *bio;
126 35
127 if (!b->c->verify) 36 if (!b->c->verify || !b->c->verify_ondisk)
128 return; 37 return;
129 38
130 closure_wait_event(&b->io.wait, &cl, 39 down(&b->io_mutex);
131 atomic_read(&b->io.cl.remaining) == -1);
132
133 mutex_lock(&b->c->verify_lock); 40 mutex_lock(&b->c->verify_lock);
134 41
42 ondisk = b->c->verify_ondisk;
43 sorted = b->c->verify_data->keys.set->data;
44 inmemory = b->keys.set->data;
45
135 bkey_copy(&v->key, &b->key); 46 bkey_copy(&v->key, &b->key);
136 v->written = 0; 47 v->written = 0;
137 v->level = b->level; 48 v->level = b->level;
49 v->keys.ops = b->keys.ops;
50
51 bio = bch_bbio_alloc(b->c);
52 bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
53 bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
54 bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
55 bch_bio_map(bio, sorted);
138 56
139 bch_btree_node_read(v); 57 submit_bio_wait(REQ_META|READ_SYNC, bio);
140 closure_wait_event(&v->io.wait, &cl, 58 bch_bbio_free(bio, b->c);
141 atomic_read(&b->io.cl.remaining) == -1);
142 59
143 if (new->keys != v->sets[0].data->keys || 60 memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
144 memcmp(new->start, 61
145 v->sets[0].data->start, 62 bch_btree_node_read_done(v);
146 (void *) end(new) - (void *) new->start)) { 63 sorted = v->keys.set->data;
147 unsigned i, j; 64
65 if (inmemory->keys != sorted->keys ||
66 memcmp(inmemory->start,
67 sorted->start,
68 (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
69 struct bset *i;
70 unsigned j;
148 71
149 console_lock(); 72 console_lock();
150 73
151 printk(KERN_ERR "*** original memory node:\n"); 74 printk(KERN_ERR "*** in memory:\n");
152 for (i = 0; i <= b->nsets; i++) 75 bch_dump_bset(&b->keys, inmemory, 0);
153 dump_bset(b, b->sets[i].data);
154 76
155 printk(KERN_ERR "*** sorted memory node:\n"); 77 printk(KERN_ERR "*** read back in:\n");
156 dump_bset(b, new); 78 bch_dump_bset(&v->keys, sorted, 0);
157 79
158 printk(KERN_ERR "*** on disk node:\n"); 80 for_each_written_bset(b, ondisk, i) {
159 dump_bset(v, v->sets[0].data); 81 unsigned block = ((void *) i - (void *) ondisk) /
82 block_bytes(b->c);
83
84 printk(KERN_ERR "*** on disk block %u:\n", block);
85 bch_dump_bset(&b->keys, i, block);
86 }
160 87
161 for (j = 0; j < new->keys; j++) 88 printk(KERN_ERR "*** block %zu not written\n",
162 if (new->d[j] != v->sets[0].data->d[j]) 89 ((void *) i - (void *) ondisk) / block_bytes(b->c));
90
91 for (j = 0; j < inmemory->keys; j++)
92 if (inmemory->d[j] != sorted->d[j])
163 break; 93 break;
164 94
95 printk(KERN_ERR "b->written %u\n", b->written);
96
165 console_unlock(); 97 console_unlock();
166 panic("verify failed at %u\n", j); 98 panic("verify failed at %u\n", j);
167 } 99 }
168 100
169 mutex_unlock(&b->c->verify_lock); 101 mutex_unlock(&b->c->verify_lock);
102 up(&b->io_mutex);
170} 103}
171 104
172void bch_data_verify(struct cached_dev *dc, struct bio *bio) 105void bch_data_verify(struct cached_dev *dc, struct bio *bio)
173{ 106{
174 char name[BDEVNAME_SIZE]; 107 char name[BDEVNAME_SIZE];
175 struct bio *check; 108 struct bio *check;
176 struct bio_vec *bv; 109 struct bio_vec bv, *bv2;
110 struct bvec_iter iter;
177 int i; 111 int i;
178 112
179 check = bio_clone(bio, GFP_NOIO); 113 check = bio_clone(bio, GFP_NOIO);
@@ -185,95 +119,27 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
185 119
186 submit_bio_wait(READ_SYNC, check); 120 submit_bio_wait(READ_SYNC, check);
187 121
188 bio_for_each_segment(bv, bio, i) { 122 bio_for_each_segment(bv, bio, iter) {
189 void *p1 = kmap_atomic(bv->bv_page); 123 void *p1 = kmap_atomic(bv.bv_page);
190 void *p2 = page_address(check->bi_io_vec[i].bv_page); 124 void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
191 125
192 cache_set_err_on(memcmp(p1 + bv->bv_offset, 126 cache_set_err_on(memcmp(p1 + bv.bv_offset,
193 p2 + bv->bv_offset, 127 p2 + bv.bv_offset,
194 bv->bv_len), 128 bv.bv_len),
195 dc->disk.c, 129 dc->disk.c,
196 "verify failed at dev %s sector %llu", 130 "verify failed at dev %s sector %llu",
197 bdevname(dc->bdev, name), 131 bdevname(dc->bdev, name),
198 (uint64_t) bio->bi_sector); 132 (uint64_t) bio->bi_iter.bi_sector);
199 133
200 kunmap_atomic(p1); 134 kunmap_atomic(p1);
201 } 135 }
202 136
203 bio_for_each_segment_all(bv, check, i) 137 bio_for_each_segment_all(bv2, check, i)
204 __free_page(bv->bv_page); 138 __free_page(bv2->bv_page);
205out_put: 139out_put:
206 bio_put(check); 140 bio_put(check);
207} 141}
208 142
209int __bch_count_data(struct btree *b)
210{
211 unsigned ret = 0;
212 struct btree_iter iter;
213 struct bkey *k;
214
215 if (!b->level)
216 for_each_key(b, k, &iter)
217 ret += KEY_SIZE(k);
218 return ret;
219}
220
221void __bch_check_keys(struct btree *b, const char *fmt, ...)
222{
223 va_list args;
224 struct bkey *k, *p = NULL;
225 struct btree_iter iter;
226 const char *err;
227
228 for_each_key(b, k, &iter) {
229 if (!b->level) {
230 err = "Keys out of order";
231 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
232 goto bug;
233
234 if (bch_ptr_invalid(b, k))
235 continue;
236
237 err = "Overlapping keys";
238 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
239 goto bug;
240 } else {
241 if (bch_ptr_bad(b, k))
242 continue;
243
244 err = "Duplicate keys";
245 if (p && !bkey_cmp(p, k))
246 goto bug;
247 }
248 p = k;
249 }
250
251 err = "Key larger than btree node key";
252 if (p && bkey_cmp(p, &b->key) > 0)
253 goto bug;
254
255 return;
256bug:
257 bch_dump_bucket(b);
258
259 va_start(args, fmt);
260 vprintk(fmt, args);
261 va_end(args);
262
263 panic("bcache error: %s:\n", err);
264}
265
266void bch_btree_iter_next_check(struct btree_iter *iter)
267{
268 struct bkey *k = iter->data->k, *next = bkey_next(k);
269
270 if (next < iter->data->end &&
271 bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
272 bch_dump_bucket(iter->b);
273 panic("Key skipped backwards\n");
274 }
275}
276
277#endif 143#endif
278 144
279#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
@@ -320,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
320 if (!w) 186 if (!w)
321 break; 187 break;
322 188
323 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 189 bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
324 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 190 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
325 bch_keybuf_del(&i->keys, w); 191 bch_keybuf_del(&i->keys, w);
326 } 192 }
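
Note on the debug.c rework above: bch_btree_verify() now reads the node back from disk with submit_bio_wait(), runs it through bch_btree_node_read_done() to get a sorted copy, and compares the in-memory bset against it word by word, dumping every written bset on a mismatch. A minimal userspace sketch of just the comparison step, assuming the keys are plain arrays of 64-bit words (the helper name first_mismatch is made up, not from the patch):

/* Toy model of the verify comparison: find the first 64-bit word where the
 * in-memory keys diverge from the keys read back from disk. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>

static size_t first_mismatch(const uint64_t *inmemory, const uint64_t *ondisk,
			     size_t nkeys)
{
	size_t j;

	for (j = 0; j < nkeys; j++)
		if (inmemory[j] != ondisk[j])
			break;
	return j;			/* == nkeys when the sets match */
}

int main(void)
{
	uint64_t mem[]  = { 1, 2, 3, 4 };
	uint64_t disk[] = { 1, 2, 9, 4 };
	size_t n = sizeof(mem) / sizeof(mem[0]);

	if (memcmp(mem, disk, sizeof(mem)))
		printf("verify failed at %zu\n", first_mismatch(mem, disk, n));
	else
		printf("in-memory and on-disk keys match\n");
	return 0;
}
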
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 2ede60e31874..1f63c195d247 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -1,47 +1,30 @@
1#ifndef _BCACHE_DEBUG_H 1#ifndef _BCACHE_DEBUG_H
2#define _BCACHE_DEBUG_H 2#define _BCACHE_DEBUG_H
3 3
4/* Btree/bkey debug printing */ 4struct bio;
5 5struct cached_dev;
6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 6struct cache_set;
7 7
8#ifdef CONFIG_BCACHE_DEBUG 8#ifdef CONFIG_BCACHE_DEBUG
9 9
10void bch_btree_verify(struct btree *, struct bset *); 10void bch_btree_verify(struct btree *);
11void bch_data_verify(struct cached_dev *, struct bio *); 11void bch_data_verify(struct cached_dev *, struct bio *);
12int __bch_count_data(struct btree *);
13void __bch_check_keys(struct btree *, const char *, ...);
14void bch_btree_iter_next_check(struct btree_iter *);
15 12
16#define EBUG_ON(cond) BUG_ON(cond)
17#define expensive_debug_checks(c) ((c)->expensive_debug_checks) 13#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
18#define key_merging_disabled(c) ((c)->key_merging_disabled) 14#define key_merging_disabled(c) ((c)->key_merging_disabled)
19#define bypass_torture_test(d) ((d)->bypass_torture_test) 15#define bypass_torture_test(d) ((d)->bypass_torture_test)
20 16
21#else /* DEBUG */ 17#else /* DEBUG */
22 18
23static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 19static inline void bch_btree_verify(struct btree *b) {}
24static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} 20static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
25static inline int __bch_count_data(struct btree *b) { return -1; }
26static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
27static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
28 21
29#define EBUG_ON(cond) do { if (cond); } while (0)
30#define expensive_debug_checks(c) 0 22#define expensive_debug_checks(c) 0
31#define key_merging_disabled(c) 0 23#define key_merging_disabled(c) 0
32#define bypass_torture_test(d) 0 24#define bypass_torture_test(d) 0
33 25
34#endif 26#endif
35 27
36#define bch_count_data(b) \
37 (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
38
39#define bch_check_keys(b, ...) \
40do { \
41 if (expensive_debug_checks((b)->c)) \
42 __bch_check_keys(b, __VA_ARGS__); \
43} while (0)
44
45#ifdef CONFIG_DEBUG_FS 28#ifdef CONFIG_DEBUG_FS
46void bch_debug_init_cache_set(struct cache_set *); 29void bch_debug_init_cache_set(struct cache_set *);
47#else 30#else
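
Note on the debug.h change above: the header now exposes only bch_btree_verify()/bch_data_verify(), with empty static inline stubs when CONFIG_BCACHE_DEBUG is off, so callers never need their own #ifdefs. A minimal sketch of that compile-time stub pattern, with MY_DEBUG standing in for the real config symbol:

/* Sketch of the "#ifdef real function, #else inline no-op" pattern used in
 * debug.h; MY_DEBUG is a stand-in for CONFIG_BCACHE_DEBUG. */
#include <stdio.h>

#ifdef MY_DEBUG
static void my_verify(int x)
{
	printf("expensive check on %d\n", x);
}
#else
static inline void my_verify(int x) {}	/* compiles away when debug is off */
#endif

int main(void)
{
	my_verify(42);		/* call sites stay identical either way */
	return 0;
}
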
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 000000000000..416d1a3e028e
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,616 @@
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "extents.h"
27#include "writeback.h"
28
29static void sort_key_next(struct btree_iter *iter,
30 struct btree_iter_set *i)
31{
32 i->k = bkey_next(i->k);
33
34 if (i->k == i->end)
35 *i = iter->data[--iter->used];
36}
37
38static bool bch_key_sort_cmp(struct btree_iter_set l,
39 struct btree_iter_set r)
40{
41 int64_t c = bkey_cmp(l.k, r.k);
42
43 return c ? c > 0 : l.k < r.k;
44}
45
46static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
47{
48 unsigned i;
49
50 for (i = 0; i < KEY_PTRS(k); i++)
51 if (ptr_available(c, k, i)) {
52 struct cache *ca = PTR_CACHE(c, k, i);
53 size_t bucket = PTR_BUCKET_NR(c, k, i);
54 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
55
56 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
57 bucket < ca->sb.first_bucket ||
58 bucket >= ca->sb.nbuckets)
59 return true;
60 }
61
62 return false;
63}
64
65/* Common among btree and extent ptrs */
66
67static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
68{
69 unsigned i;
70
71 for (i = 0; i < KEY_PTRS(k); i++)
72 if (ptr_available(c, k, i)) {
73 struct cache *ca = PTR_CACHE(c, k, i);
74 size_t bucket = PTR_BUCKET_NR(c, k, i);
75 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
76
77 if (KEY_SIZE(k) + r > c->sb.bucket_size)
78 return "bad, length too big";
79 if (bucket < ca->sb.first_bucket)
80 return "bad, short offset";
81 if (bucket >= ca->sb.nbuckets)
82 return "bad, offset past end of device";
83 if (ptr_stale(c, k, i))
84 return "stale";
85 }
86
87 if (!bkey_cmp(k, &ZERO_KEY))
88 return "bad, null key";
89 if (!KEY_PTRS(k))
90 return "bad, no pointers";
91 if (!KEY_SIZE(k))
92 return "zeroed key";
93 return "";
94}
95
96void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
97{
98 unsigned i = 0;
99 char *out = buf, *end = buf + size;
100
101#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
102
103 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
104
105 for (i = 0; i < KEY_PTRS(k); i++) {
106 if (i)
107 p(", ");
108
109 if (PTR_DEV(k, i) == PTR_CHECK_DEV)
110 p("check dev");
111 else
112 p("%llu:%llu gen %llu", PTR_DEV(k, i),
113 PTR_OFFSET(k, i), PTR_GEN(k, i));
114 }
115
116 p("]");
117
118 if (KEY_DIRTY(k))
119 p(" dirty");
120 if (KEY_CSUM(k))
121 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
122#undef p
123}
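
Note: bch_extent_to_text() above builds the key description with a small p() macro that accumulates scnprintf() output and never writes past the end of the buffer. A userspace sketch of the same bounded-append idea, clamping snprintf()'s return value by hand since scnprintf() is kernel-only (the helper name bounded_append is invented for the example):

/* Userspace sketch of the p() accumulator: append formatted text to a fixed
 * buffer without ever overrunning it. */
#include <stdarg.h>
#include <stdio.h>

static char *bounded_append(char *out, char *end, const char *fmt, ...)
{
	va_list args;
	int n;

	if (out >= end)
		return out;
	va_start(args, fmt);
	n = vsnprintf(out, end - out, fmt, args);
	va_end(args);
	if (n < 0)
		return out;
	return (n >= end - out) ? end : out + n;	/* clamp like scnprintf */
}

int main(void)
{
	char buf[64], *out = buf, *end = buf + sizeof(buf);

	out = bounded_append(out, end, "%u:%u len %u -> [", 5, 1024, 8);
	out = bounded_append(out, end, "%u:%u gen %u", 0, 4096, 3);
	out = bounded_append(out, end, "]");
	printf("%s\n", buf);
	return 0;
}
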
124
125static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
126{
127 struct btree *b = container_of(keys, struct btree, keys);
128 unsigned j;
129 char buf[80];
130
131 bch_extent_to_text(buf, sizeof(buf), k);
132 printk(" %s", buf);
133
134 for (j = 0; j < KEY_PTRS(k); j++) {
135 size_t n = PTR_BUCKET_NR(b->c, k, j);
136 printk(" bucket %zu", n);
137
138 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
139 printk(" prio %i",
140 PTR_BUCKET(b->c, k, j)->prio);
141 }
142
143 printk(" %s\n", bch_ptr_status(b->c, k));
144}
145
146/* Btree ptrs */
147
148bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
149{
150 char buf[80];
151
152 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
153 goto bad;
154
155 if (__ptr_invalid(c, k))
156 goto bad;
157
158 return false;
159bad:
160 bch_extent_to_text(buf, sizeof(buf), k);
161 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
162 return true;
163}
164
165static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
166{
167 struct btree *b = container_of(bk, struct btree, keys);
168 return __bch_btree_ptr_invalid(b->c, k);
169}
170
171static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
172{
173 unsigned i;
174 char buf[80];
175 struct bucket *g;
176
177 if (mutex_trylock(&b->c->bucket_lock)) {
178 for (i = 0; i < KEY_PTRS(k); i++)
179 if (ptr_available(b->c, k, i)) {
180 g = PTR_BUCKET(b->c, k, i);
181
182 if (KEY_DIRTY(k) ||
183 g->prio != BTREE_PRIO ||
184 (b->c->gc_mark_valid &&
185 GC_MARK(g) != GC_MARK_METADATA))
186 goto err;
187 }
188
189 mutex_unlock(&b->c->bucket_lock);
190 }
191
192 return false;
193err:
194 mutex_unlock(&b->c->bucket_lock);
195 bch_extent_to_text(buf, sizeof(buf), k);
196 btree_bug(b,
197"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
198 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
199 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
200 return true;
201}
202
203static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
204{
205 struct btree *b = container_of(bk, struct btree, keys);
206 unsigned i;
207
208 if (!bkey_cmp(k, &ZERO_KEY) ||
209 !KEY_PTRS(k) ||
210 bch_ptr_invalid(bk, k))
211 return true;
212
213 for (i = 0; i < KEY_PTRS(k); i++)
214 if (!ptr_available(b->c, k, i) ||
215 ptr_stale(b->c, k, i))
216 return true;
217
218 if (expensive_debug_checks(b->c) &&
219 btree_ptr_bad_expensive(b, k))
220 return true;
221
222 return false;
223}
224
225static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
226 struct bkey *insert,
227 struct btree_iter *iter,
228 struct bkey *replace_key)
229{
230 struct btree *b = container_of(bk, struct btree, keys);
231
232 if (!KEY_OFFSET(insert))
233 btree_current_write(b)->prio_blocked++;
234
235 return false;
236}
237
238const struct btree_keys_ops bch_btree_keys_ops = {
239 .sort_cmp = bch_key_sort_cmp,
240 .insert_fixup = bch_btree_ptr_insert_fixup,
241 .key_invalid = bch_btree_ptr_invalid,
242 .key_bad = bch_btree_ptr_bad,
243 .key_to_text = bch_extent_to_text,
244 .key_dump = bch_bkey_dump,
245};
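
Note: bch_btree_keys_ops above (and bch_extent_keys_ops later in this file) are per-key-type operation tables; the generic bset code dispatches through them so btree-pointer keys and extent keys can validate, sort and merge differently. A minimal sketch of that ops-table dispatch, with toy types that only echo the shape of the real structures:

/* Minimal sketch of per-key-type ops tables: generic code calls through
 * function pointers, each key type supplies its own validation rule. */
#include <stdbool.h>
#include <stdio.h>

struct toy_key {
	unsigned size;
	unsigned nr_ptrs;
};

struct key_ops {
	const char *name;
	bool (*key_invalid)(const struct toy_key *);
};

static bool btree_ptr_invalid(const struct toy_key *k)
{
	/* btree pointers must have pointers and a size */
	return !k->nr_ptrs || !k->size;
}

static bool extent_invalid(const struct toy_key *k)
{
	/* zero-size extents are invalid, pointer-less ones are allowed */
	return !k->size;
}

static const struct key_ops btree_keys_ops  = { "btree",  btree_ptr_invalid };
static const struct key_ops extent_keys_ops = { "extent", extent_invalid };

static void check(const struct key_ops *ops, const struct toy_key *k)
{
	printf("%s key: %s\n", ops->name,
	       ops->key_invalid(k) ? "invalid" : "ok");
}

int main(void)
{
	struct toy_key k = { .size = 8, .nr_ptrs = 0 };

	check(&btree_keys_ops, &k);	/* invalid: no pointers */
	check(&extent_keys_ops, &k);	/* ok */
	return 0;
}
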
246
247/* Extents */
248
249/*
250 * Returns true if l > r - unless l == r, in which case returns true if l is
251 * older than r.
252 *
253 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
254 * equal in different sets, we have to process them newest to oldest.
255 */
256static bool bch_extent_sort_cmp(struct btree_iter_set l,
257 struct btree_iter_set r)
258{
259 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
260
261 return c ? c > 0 : l.k < r.k;
262}
263
264static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
265 struct bkey *tmp)
266{
267 while (iter->used > 1) {
268 struct btree_iter_set *top = iter->data, *i = top + 1;
269
270 if (iter->used > 2 &&
271 bch_extent_sort_cmp(i[0], i[1]))
272 i++;
273
274 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
275 break;
276
277 if (!KEY_SIZE(i->k)) {
278 sort_key_next(iter, i);
279 heap_sift(iter, i - top, bch_extent_sort_cmp);
280 continue;
281 }
282
283 if (top->k > i->k) {
284 if (bkey_cmp(top->k, i->k) >= 0)
285 sort_key_next(iter, i);
286 else
287 bch_cut_front(top->k, i->k);
288
289 heap_sift(iter, i - top, bch_extent_sort_cmp);
290 } else {
291 /* can't happen because of comparison func */
292 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
293
294 if (bkey_cmp(i->k, top->k) < 0) {
295 bkey_copy(tmp, top->k);
296
297 bch_cut_back(&START_KEY(i->k), tmp);
298 bch_cut_front(i->k, top->k);
299 heap_sift(iter, 0, bch_extent_sort_cmp);
300
301 return tmp;
302 } else {
303 bch_cut_back(&START_KEY(i->k), top->k);
304 }
305 }
306 }
307
308 return NULL;
309}
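
Note: bch_extent_sort_fixup() above resolves overlaps between extents coming from different bsets by trimming with bch_cut_front()/bch_cut_back(), which move an extent's start forward or its end back. A toy sketch of those two cut operations on half-open (start, end) sector ranges; the struct and helpers only mirror the kernel names, they are not the real bkey code:

/* Toy model of bch_cut_front()/bch_cut_back() on half-open sector ranges:
 * cut_front moves the start forward, cut_back moves the end backward, so the
 * overlapping part of an older extent can be trimmed away. */
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start, end;	/* end is exclusive, like KEY_OFFSET */
};

static void cut_front(uint64_t where, struct extent *e)
{
	if (e->start < where)
		e->start = where > e->end ? e->end : where;
}

static void cut_back(uint64_t where, struct extent *e)
{
	if (e->end > where)
		e->end = where < e->start ? e->start : where;
}

int main(void)
{
	struct extent old = { 0, 100 }, new = { 60, 120 };

	/* keep 'new' intact, trim the overlapping tail off 'old' */
	cut_back(new.start, &old);
	cut_front(old.end, &old);	/* no-op here, shown for symmetry */
	printf("old: [%llu, %llu)  new: [%llu, %llu)\n",
	       (unsigned long long) old.start, (unsigned long long) old.end,
	       (unsigned long long) new.start, (unsigned long long) new.end);
	return 0;
}
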
310
311static bool bch_extent_insert_fixup(struct btree_keys *b,
312 struct bkey *insert,
313 struct btree_iter *iter,
314 struct bkey *replace_key)
315{
316 struct cache_set *c = container_of(b, struct btree, keys)->c;
317
318 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
319 {
320 if (KEY_DIRTY(k))
321 bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
322 offset, -sectors);
323 }
324
325 uint64_t old_offset;
326 unsigned old_size, sectors_found = 0;
327
328 BUG_ON(!KEY_OFFSET(insert));
329 BUG_ON(!KEY_SIZE(insert));
330
331 while (1) {
332 struct bkey *k = bch_btree_iter_next(iter);
333 if (!k)
334 break;
335
336 if (bkey_cmp(&START_KEY(k), insert) >= 0) {
337 if (KEY_SIZE(k))
338 break;
339 else
340 continue;
341 }
342
343 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
344 continue;
345
346 old_offset = KEY_START(k);
347 old_size = KEY_SIZE(k);
348
349 /*
350 * We might overlap with 0 size extents; we can't skip these
351 * because if they're in the set we're inserting to we have to
352 * adjust them so they don't overlap with the key we're
353 * inserting. But we don't want to check them for replace
354 * operations.
355 */
356
357 if (replace_key && KEY_SIZE(k)) {
358 /*
359 * k might have been split since we inserted/found the
360 * key we're replacing
361 */
362 unsigned i;
363 uint64_t offset = KEY_START(k) -
364 KEY_START(replace_key);
365
366 /* But it must be a subset of the replace key */
367 if (KEY_START(k) < KEY_START(replace_key) ||
368 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
369 goto check_failed;
370
371 /* We didn't find a key that we were supposed to */
372 if (KEY_START(k) > KEY_START(insert) + sectors_found)
373 goto check_failed;
374
375 if (!bch_bkey_equal_header(k, replace_key))
376 goto check_failed;
377
378 /* skip past gen */
379 offset <<= 8;
380
381 BUG_ON(!KEY_PTRS(replace_key));
382
383 for (i = 0; i < KEY_PTRS(replace_key); i++)
384 if (k->ptr[i] != replace_key->ptr[i] + offset)
385 goto check_failed;
386
387 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
388 }
389
390 if (bkey_cmp(insert, k) < 0 &&
391 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
392 /*
393 * We overlapped in the middle of an existing key: that
394 * means we have to split the old key. But we have to do
395 * slightly different things depending on whether the
396 * old key has been written out yet.
397 */
398
399 struct bkey *top;
400
401 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
402
403 if (bkey_written(b, k)) {
404 /*
405 * We insert a new key to cover the top of the
406 * old key, and the old key is modified in place
407 * to represent the bottom split.
408 *
409 * It's completely arbitrary whether the new key
410 * is the top or the bottom, but it has to match
411 * up with what btree_sort_fixup() does - it
412 * doesn't check for this kind of overlap, it
413 * depends on us inserting a new key for the top
414 * here.
415 */
416 top = bch_bset_search(b, bset_tree_last(b),
417 insert);
418 bch_bset_insert(b, top, k);
419 } else {
420 BKEY_PADDED(key) temp;
421 bkey_copy(&temp.key, k);
422 bch_bset_insert(b, k, &temp.key);
423 top = bkey_next(k);
424 }
425
426 bch_cut_front(insert, top);
427 bch_cut_back(&START_KEY(insert), k);
428 bch_bset_fix_invalidated_key(b, k);
429 goto out;
430 }
431
432 if (bkey_cmp(insert, k) < 0) {
433 bch_cut_front(insert, k);
434 } else {
435 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
436 old_offset = KEY_START(insert);
437
438 if (bkey_written(b, k) &&
439 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
440 /*
441 * Completely overwrote, so we don't have to
442 * invalidate the binary search tree
443 */
444 bch_cut_front(k, k);
445 } else {
446 __bch_cut_back(&START_KEY(insert), k);
447 bch_bset_fix_invalidated_key(b, k);
448 }
449 }
450
451 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
452 }
453
454check_failed:
455 if (replace_key) {
456 if (!sectors_found) {
457 return true;
458 } else if (sectors_found < KEY_SIZE(insert)) {
459 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
460 (KEY_SIZE(insert) - sectors_found));
461 SET_KEY_SIZE(insert, sectors_found);
462 }
463 }
464out:
465 if (KEY_DIRTY(insert))
466 bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
467 KEY_START(insert),
468 KEY_SIZE(insert));
469
470 return false;
471}
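
Note: in bch_extent_insert_fixup() above, an insert that lands strictly inside an existing key splits it into a bottom piece and a top piece (which piece becomes the freshly inserted key depends on whether the old key was already written, per the comment in the patch). A sketch of that middle-overlap split on toy intervals, independent of the bset machinery:

/* Toy model of the middle-overlap case: an insert strictly inside an existing
 * extent splits it into a bottom piece (before the insert) and a top piece
 * (after it). */
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start, end;	/* half-open sector range */
};

/* Returns 1 and fills *bottom/*top when 'ins' splits 'old' in the middle. */
static int split_around(struct extent old, struct extent ins,
			struct extent *bottom, struct extent *top)
{
	if (ins.start <= old.start || ins.end >= old.end)
		return 0;	/* not a middle overlap */

	bottom->start = old.start;
	bottom->end   = ins.start;	/* like bch_cut_back(START_KEY(insert)) */
	top->start    = ins.end;	/* like bch_cut_front(insert) */
	top->end      = old.end;
	return 1;
}

int main(void)
{
	struct extent old = { 0, 100 }, ins = { 40, 60 }, bot, top;

	if (split_around(old, ins, &bot, &top))
		printf("bottom [%llu,%llu) insert [%llu,%llu) top [%llu,%llu)\n",
		       (unsigned long long) bot.start, (unsigned long long) bot.end,
		       (unsigned long long) ins.start, (unsigned long long) ins.end,
		       (unsigned long long) top.start, (unsigned long long) top.end);
	return 0;
}
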
472
473static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
474{
475 struct btree *b = container_of(bk, struct btree, keys);
476 char buf[80];
477
478 if (!KEY_SIZE(k))
479 return true;
480
481 if (KEY_SIZE(k) > KEY_OFFSET(k))
482 goto bad;
483
484 if (__ptr_invalid(b->c, k))
485 goto bad;
486
487 return false;
488bad:
489 bch_extent_to_text(buf, sizeof(buf), k);
490 cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
491 return true;
492}
493
494static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
495 unsigned ptr)
496{
497 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
498 char buf[80];
499
500 if (mutex_trylock(&b->c->bucket_lock)) {
501 if (b->c->gc_mark_valid &&
502 ((GC_MARK(g) != GC_MARK_DIRTY &&
503 KEY_DIRTY(k)) ||
504 GC_MARK(g) == GC_MARK_METADATA))
505 goto err;
506
507 if (g->prio == BTREE_PRIO)
508 goto err;
509
510 mutex_unlock(&b->c->bucket_lock);
511 }
512
513 return false;
514err:
515 mutex_unlock(&b->c->bucket_lock);
516 bch_extent_to_text(buf, sizeof(buf), k);
517 btree_bug(b,
518"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
519 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
520 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
521 return true;
522}
523
524static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
525{
526 struct btree *b = container_of(bk, struct btree, keys);
527 struct bucket *g;
528 unsigned i, stale;
529
530 if (!KEY_PTRS(k) ||
531 bch_extent_invalid(bk, k))
532 return true;
533
534 for (i = 0; i < KEY_PTRS(k); i++)
535 if (!ptr_available(b->c, k, i))
536 return true;
537
538 if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
539 return false;
540
541 for (i = 0; i < KEY_PTRS(k); i++) {
542 g = PTR_BUCKET(b->c, k, i);
543 stale = ptr_stale(b->c, k, i);
544
545 btree_bug_on(stale > 96, b,
546 "key too stale: %i, need_gc %u",
547 stale, b->c->need_gc);
548
549 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
550 b, "stale dirty pointer");
551
552 if (stale)
553 return true;
554
555 if (expensive_debug_checks(b->c) &&
556 bch_extent_bad_expensive(b, k, i))
557 return true;
558 }
559
560 return false;
561}
562
563static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
564{
565 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
566 ~((uint64_t)1 << 63);
567}
568
569static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
570{
571 struct btree *b = container_of(bk, struct btree, keys);
572 unsigned i;
573
574 if (key_merging_disabled(b->c))
575 return false;
576
577 for (i = 0; i < KEY_PTRS(l); i++)
578 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
579 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
580 return false;
581
582 /* Keys with no pointers aren't restricted to one bucket and could
583 * overflow KEY_SIZE
584 */
585 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
586 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
587 SET_KEY_SIZE(l, USHRT_MAX);
588
589 bch_cut_front(l, r);
590 return false;
591 }
592
593 if (KEY_CSUM(l)) {
594 if (KEY_CSUM(r))
595 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
596 else
597 SET_KEY_CSUM(l, 0);
598 }
599
600 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
601 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
602
603 return true;
604}
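
Note: bch_extent_merge() above only combines two extents when each pointer in the right key continues exactly where the left key's pointer ends, and it refuses to let the merged size overflow the 16-bit KEY_SIZE field (hence the USHRT_MAX check). A small sketch of that size-clamping decision on plain integers, not real bkeys:

/* Sketch of the size check in bch_extent_merge(): a merged extent's size
 * lives in a 16-bit field, so sizes that would exceed USHRT_MAX cannot be
 * fully merged. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static bool can_merge_sizes(unsigned lsize, unsigned rsize)
{
	return (unsigned long) lsize + rsize <= USHRT_MAX;
}

int main(void)
{
	printf("1024 + 2048   -> %s\n",
	       can_merge_sizes(1024, 2048) ? "merge" : "keep separate");
	printf("60000 + 10000 -> %s\n",
	       can_merge_sizes(60000, 10000) ? "merge" : "keep separate");
	return 0;
}
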
605
606const struct btree_keys_ops bch_extent_keys_ops = {
607 .sort_cmp = bch_extent_sort_cmp,
608 .sort_fixup = bch_extent_sort_fixup,
609 .insert_fixup = bch_extent_insert_fixup,
610 .key_invalid = bch_extent_invalid,
611 .key_bad = bch_extent_bad,
612 .key_merge = bch_extent_merge,
613 .key_to_text = bch_extent_to_text,
614 .key_dump = bch_bkey_dump,
615 .is_extents = true,
616};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 000000000000..e4e23409782d
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@
1#ifndef _BCACHE_EXTENTS_H
2#define _BCACHE_EXTENTS_H
3
4extern const struct btree_keys_ops bch_btree_keys_ops;
5extern const struct btree_keys_ops bch_extent_keys_ops;
6
7struct bkey;
8struct cache_set;
9
10void bch_extent_to_text(char *, size_t, const struct bkey *);
11bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
12
13#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 9056632995b1..fa028fa82df4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,178 +11,40 @@
11 11
12#include <linux/blkdev.h> 12#include <linux/blkdev.h>
13 13
14static void bch_bi_idx_hack_endio(struct bio *bio, int error)
15{
16 struct bio *p = bio->bi_private;
17
18 bio_endio(p, error);
19 bio_put(bio);
20}
21
22static void bch_generic_make_request_hack(struct bio *bio)
23{
24 if (bio->bi_idx) {
25 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
26
27 memcpy(clone->bi_io_vec,
28 bio_iovec(bio),
29 bio_segments(bio) * sizeof(struct bio_vec));
30
31 clone->bi_sector = bio->bi_sector;
32 clone->bi_bdev = bio->bi_bdev;
33 clone->bi_rw = bio->bi_rw;
34 clone->bi_vcnt = bio_segments(bio);
35 clone->bi_size = bio->bi_size;
36
37 clone->bi_private = bio;
38 clone->bi_end_io = bch_bi_idx_hack_endio;
39
40 bio = clone;
41 }
42
43 /*
44 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
45 * bios might have had more than that (before we split them per device
46 * limitations).
47 *
48 * To be taken out once immutable bvec stuff is in.
49 */
50 bio->bi_max_vecs = bio->bi_vcnt;
51
52 generic_make_request(bio);
53}
54
55/**
56 * bch_bio_split - split a bio
57 * @bio: bio to split
58 * @sectors: number of sectors to split from the front of @bio
59 * @gfp: gfp mask
60 * @bs: bio set to allocate from
61 *
62 * Allocates and returns a new bio which represents @sectors from the start of
63 * @bio, and updates @bio to represent the remaining sectors.
64 *
65 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
66 * unchanged.
67 *
68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
69 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
70 * freed before the split.
71 */
72struct bio *bch_bio_split(struct bio *bio, int sectors,
73 gfp_t gfp, struct bio_set *bs)
74{
75 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
76 struct bio_vec *bv;
77 struct bio *ret = NULL;
78
79 BUG_ON(sectors <= 0);
80
81 if (sectors >= bio_sectors(bio))
82 return bio;
83
84 if (bio->bi_rw & REQ_DISCARD) {
85 ret = bio_alloc_bioset(gfp, 1, bs);
86 if (!ret)
87 return NULL;
88 idx = 0;
89 goto out;
90 }
91
92 bio_for_each_segment(bv, bio, idx) {
93 vcnt = idx - bio->bi_idx;
94
95 if (!nbytes) {
96 ret = bio_alloc_bioset(gfp, vcnt, bs);
97 if (!ret)
98 return NULL;
99
100 memcpy(ret->bi_io_vec, bio_iovec(bio),
101 sizeof(struct bio_vec) * vcnt);
102
103 break;
104 } else if (nbytes < bv->bv_len) {
105 ret = bio_alloc_bioset(gfp, ++vcnt, bs);
106 if (!ret)
107 return NULL;
108
109 memcpy(ret->bi_io_vec, bio_iovec(bio),
110 sizeof(struct bio_vec) * vcnt);
111
112 ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
113 bv->bv_offset += nbytes;
114 bv->bv_len -= nbytes;
115 break;
116 }
117
118 nbytes -= bv->bv_len;
119 }
120out:
121 ret->bi_bdev = bio->bi_bdev;
122 ret->bi_sector = bio->bi_sector;
123 ret->bi_size = sectors << 9;
124 ret->bi_rw = bio->bi_rw;
125 ret->bi_vcnt = vcnt;
126 ret->bi_max_vecs = vcnt;
127
128 bio->bi_sector += sectors;
129 bio->bi_size -= sectors << 9;
130 bio->bi_idx = idx;
131
132 if (bio_integrity(bio)) {
133 if (bio_integrity_clone(ret, bio, gfp)) {
134 bio_put(ret);
135 return NULL;
136 }
137
138 bio_integrity_trim(ret, 0, bio_sectors(ret));
139 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
140 }
141
142 return ret;
143}
144
145static unsigned bch_bio_max_sectors(struct bio *bio) 14static unsigned bch_bio_max_sectors(struct bio *bio)
146{ 15{
147 unsigned ret = bio_sectors(bio);
148 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 16 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 17 struct bio_vec bv;
150 queue_max_segments(q)); 18 struct bvec_iter iter;
19 unsigned ret = 0, seg = 0;
151 20
152 if (bio->bi_rw & REQ_DISCARD) 21 if (bio->bi_rw & REQ_DISCARD)
153 return min(ret, q->limits.max_discard_sectors); 22 return min(bio_sectors(bio), q->limits.max_discard_sectors);
154 23
155 if (bio_segments(bio) > max_segments || 24 bio_for_each_segment(bv, bio, iter) {
156 q->merge_bvec_fn) { 25 struct bvec_merge_data bvm = {
157 struct bio_vec *bv; 26 .bi_bdev = bio->bi_bdev,
158 int i, seg = 0; 27 .bi_sector = bio->bi_iter.bi_sector,
159 28 .bi_size = ret << 9,
160 ret = 0; 29 .bi_rw = bio->bi_rw,
161 30 };
162 bio_for_each_segment(bv, bio, i) { 31
163 struct bvec_merge_data bvm = { 32 if (seg == min_t(unsigned, BIO_MAX_PAGES,
164 .bi_bdev = bio->bi_bdev, 33 queue_max_segments(q)))
165 .bi_sector = bio->bi_sector, 34 break;
166 .bi_size = ret << 9,
167 .bi_rw = bio->bi_rw,
168 };
169
170 if (seg == max_segments)
171 break;
172 35
173 if (q->merge_bvec_fn && 36 if (q->merge_bvec_fn &&
174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 37 q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
175 break; 38 break;
176 39
177 seg++; 40 seg++;
178 ret += bv->bv_len >> 9; 41 ret += bv.bv_len >> 9;
179 }
180 } 42 }
181 43
182 ret = min(ret, queue_max_sectors(q)); 44 ret = min(ret, queue_max_sectors(q));
183 45
184 WARN_ON(!ret); 46 WARN_ON(!ret);
185 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); 47 ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9);
186 48
187 return ret; 49 return ret;
188} 50}
@@ -193,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
193 55
194 s->bio->bi_end_io = s->bi_end_io; 56 s->bio->bi_end_io = s->bi_end_io;
195 s->bio->bi_private = s->bi_private; 57 s->bio->bi_private = s->bi_private;
196 bio_endio(s->bio, 0); 58 bio_endio_nodec(s->bio, 0);
197 59
198 closure_debug_destroy(&s->cl); 60 closure_debug_destroy(&s->cl);
199 mempool_free(s, s->p->bio_split_hook); 61 mempool_free(s, s->p->bio_split_hook);
@@ -232,19 +94,19 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
232 bio_get(bio); 94 bio_get(bio);
233 95
234 do { 96 do {
235 n = bch_bio_split(bio, bch_bio_max_sectors(bio), 97 n = bio_next_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split); 98 GFP_NOIO, s->p->bio_split);
237 99
238 n->bi_end_io = bch_bio_submit_split_endio; 100 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl; 101 n->bi_private = &s->cl;
240 102
241 closure_get(&s->cl); 103 closure_get(&s->cl);
242 bch_generic_make_request_hack(n); 104 generic_make_request(n);
243 } while (n != bio); 105 } while (n != bio);
244 106
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL); 107 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
246submit: 108submit:
247 bch_generic_make_request_hack(bio); 109 generic_make_request(bio);
248} 110}
249 111
250/* Bios with headers */ 112/* Bios with headers */
@@ -272,8 +134,8 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
272{ 134{
273 struct bbio *b = container_of(bio, struct bbio, bio); 135 struct bbio *b = container_of(bio, struct bbio, bio);
274 136
275 bio->bi_sector = PTR_OFFSET(&b->key, 0); 137 bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
276 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; 138 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
277 139
278 b->submit_time_us = local_clock_us(); 140 b->submit_time_us = local_clock_us();
279 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); 141 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
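
Note on the io.c changes above: with immutable biovecs, bch_bio_max_sectors() walks the bio with bio_for_each_segment() over a bvec_iter, stopping at the queue's segment limit or where merge_bvec_fn refuses, and the open-coded bch_bio_split() is dropped in favour of the block layer's bio_next_split(). A userspace sketch of the accumulation loop over an array of segments (toy types, not the real bio structures):

/* Toy model of the bch_bio_max_sectors() loop: walk the segments of a
 * request, stop at a segment-count limit, and report how many sectors fit. */
#include <stdio.h>

struct toy_segment {
	unsigned len;		/* bytes in this segment */
};

static unsigned max_sectors(const struct toy_segment *seg, unsigned nr_segs,
			    unsigned max_segments)
{
	unsigned ret = 0, i;

	for (i = 0; i < nr_segs && i < max_segments; i++)
		ret += seg[i].len >> 9;		/* bytes -> 512-byte sectors */

	return ret;
}

int main(void)
{
	struct toy_segment segs[] = { { 4096 }, { 4096 }, { 8192 }, { 4096 } };

	printf("limit 2 segments: %u sectors\n", max_sectors(segs, 4, 2));
	printf("limit 8 segments: %u sectors\n", max_sectors(segs, 4, 8));
	return 0;
}
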
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ecdaa671bd50..18039affc306 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -44,17 +44,17 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
44 44
45 closure_init_stack(&cl); 45 closure_init_stack(&cl);
46 46
47 pr_debug("reading %llu", (uint64_t) bucket); 47 pr_debug("reading %u", bucket_index);
48 48
49 while (offset < ca->sb.bucket_size) { 49 while (offset < ca->sb.bucket_size) {
50reread: left = ca->sb.bucket_size - offset; 50reread: left = ca->sb.bucket_size - offset;
51 len = min_t(unsigned, left, PAGE_SECTORS * 8); 51 len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
52 52
53 bio_reset(bio); 53 bio_reset(bio);
54 bio->bi_sector = bucket + offset; 54 bio->bi_iter.bi_sector = bucket + offset;
55 bio->bi_bdev = ca->bdev; 55 bio->bi_bdev = ca->bdev;
56 bio->bi_rw = READ; 56 bio->bi_rw = READ;
57 bio->bi_size = len << 9; 57 bio->bi_iter.bi_size = len << 9;
58 58
59 bio->bi_end_io = journal_read_endio; 59 bio->bi_end_io = journal_read_endio;
60 bio->bi_private = &cl; 60 bio->bi_private = &cl;
@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset;
74 struct list_head *where; 74 struct list_head *where;
75 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
76 76
77 if (j->magic != jset_magic(&ca->sb)) 77 if (j->magic != jset_magic(&ca->sb)) {
78 pr_debug("%u: bad magic", bucket_index);
78 return ret; 79 return ret;
80 }
79 81
80 if (bytes > left << 9) 82 if (bytes > left << 9 ||
83 bytes > PAGE_SIZE << JSET_BITS) {
84 pr_info("%u: too big, %zu bytes, offset %u",
85 bucket_index, bytes, offset);
81 return ret; 86 return ret;
87 }
82 88
83 if (bytes > len << 9) 89 if (bytes > len << 9)
84 goto reread; 90 goto reread;
85 91
86 if (j->csum != csum_set(j)) 92 if (j->csum != csum_set(j)) {
93 pr_info("%u: bad csum, %zu bytes, offset %u",
94 bucket_index, bytes, offset);
87 return ret; 95 return ret;
96 }
88 97
89 blocks = set_blocks(j, ca->set); 98 blocks = set_blocks(j, block_bytes(ca->set));
90 99
91 while (!list_empty(list)) { 100 while (!list_empty(list)) {
92 i = list_first_entry(list, 101 i = list_first_entry(list,
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
275 } 284 }
276 285
277 for (k = i->j.start; 286 for (k = i->j.start;
278 k < end(&i->j); 287 k < bset_bkey_last(&i->j);
279 k = bkey_next(k)) { 288 k = bkey_next(k)) {
280 unsigned j; 289 unsigned j;
281 290
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
313 n, i->j.seq - 1, start, end); 322 n, i->j.seq - 1, start, end);
314 323
315 for (k = i->j.start; 324 for (k = i->j.start;
316 k < end(&i->j); 325 k < bset_bkey_last(&i->j);
317 k = bkey_next(k)) { 326 k = bkey_next(k)) {
318 trace_bcache_journal_replay_key(k); 327 trace_bcache_journal_replay_key(k);
319 328
@@ -437,13 +446,13 @@ static void do_journal_discard(struct cache *ca)
437 atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); 446 atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
438 447
439 bio_init(bio); 448 bio_init(bio);
440 bio->bi_sector = bucket_to_sector(ca->set, 449 bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
441 ca->sb.d[ja->discard_idx]); 450 ca->sb.d[ja->discard_idx]);
442 bio->bi_bdev = ca->bdev; 451 bio->bi_bdev = ca->bdev;
443 bio->bi_rw = REQ_WRITE|REQ_DISCARD; 452 bio->bi_rw = REQ_WRITE|REQ_DISCARD;
444 bio->bi_max_vecs = 1; 453 bio->bi_max_vecs = 1;
445 bio->bi_io_vec = bio->bi_inline_vecs; 454 bio->bi_io_vec = bio->bi_inline_vecs;
446 bio->bi_size = bucket_bytes(ca); 455 bio->bi_iter.bi_size = bucket_bytes(ca);
447 bio->bi_end_io = journal_discard_endio; 456 bio->bi_end_io = journal_discard_endio;
448 457
449 closure_get(&ca->set->cl); 458 closure_get(&ca->set->cl);
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
555 continue_at_nobarrier(cl, journal_write, system_wq); 564 continue_at_nobarrier(cl, journal_write, system_wq);
556} 565}
557 566
567static void journal_write_unlock(struct closure *cl)
568{
569 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
570
571 c->journal.io_in_flight = 0;
572 spin_unlock(&c->journal.lock);
573}
574
558static void journal_write_unlocked(struct closure *cl) 575static void journal_write_unlocked(struct closure *cl)
559 __releases(c->journal.lock) 576 __releases(c->journal.lock)
560{ 577{
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
562 struct cache *ca; 579 struct cache *ca;
563 struct journal_write *w = c->journal.cur; 580 struct journal_write *w = c->journal.cur;
564 struct bkey *k = &c->journal.key; 581 struct bkey *k = &c->journal.key;
565 unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; 582 unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
583 c->sb.block_size;
566 584
567 struct bio *bio; 585 struct bio *bio;
568 struct bio_list list; 586 struct bio_list list;
569 bio_list_init(&list); 587 bio_list_init(&list);
570 588
571 if (!w->need_write) { 589 if (!w->need_write) {
572 /* 590 closure_return_with_destructor(cl, journal_write_unlock);
573 * XXX: have to unlock closure before we unlock journal lock,
574 * else we race with bch_journal(). But this way we race
575 * against cache set unregister. Doh.
576 */
577 set_closure_fn(cl, NULL, NULL);
578 closure_sub(cl, CLOSURE_RUNNING + 1);
579 spin_unlock(&c->journal.lock);
580 return;
581 } else if (journal_full(&c->journal)) { 591 } else if (journal_full(&c->journal)) {
582 journal_reclaim(c); 592 journal_reclaim(c);
583 spin_unlock(&c->journal.lock); 593 spin_unlock(&c->journal.lock);
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
586 continue_at(cl, journal_write, system_wq); 596 continue_at(cl, journal_write, system_wq);
587 } 597 }
588 598
589 c->journal.blocks_free -= set_blocks(w->data, c); 599 c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
590 600
591 w->data->btree_level = c->root->level; 601 w->data->btree_level = c->root->level;
592 602
@@ -608,10 +618,10 @@ static void journal_write_unlocked(struct closure *cl)
608 atomic_long_add(sectors, &ca->meta_sectors_written); 618 atomic_long_add(sectors, &ca->meta_sectors_written);
609 619
610 bio_reset(bio); 620 bio_reset(bio);
611 bio->bi_sector = PTR_OFFSET(k, i); 621 bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
612 bio->bi_bdev = ca->bdev; 622 bio->bi_bdev = ca->bdev;
613 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; 623 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
614 bio->bi_size = sectors << 9; 624 bio->bi_iter.bi_size = sectors << 9;
615 625
616 bio->bi_end_io = journal_write_endio; 626 bio->bi_end_io = journal_write_endio;
617 bio->bi_private = w; 627 bio->bi_private = w;
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c)
653 663
654 w->need_write = true; 664 w->need_write = true;
655 665
656 if (closure_trylock(cl, &c->cl)) 666 if (!c->journal.io_in_flight) {
657 journal_write_unlocked(cl); 667 c->journal.io_in_flight = 1;
658 else 668 closure_call(cl, journal_write_unlocked, NULL, &c->cl);
669 } else {
659 spin_unlock(&c->journal.lock); 670 spin_unlock(&c->journal.lock);
671 }
660} 672}
661 673
662static struct journal_write *journal_wait_for_write(struct cache_set *c, 674static struct journal_write *journal_wait_for_write(struct cache_set *c,
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
664{ 676{
665 size_t sectors; 677 size_t sectors;
666 struct closure cl; 678 struct closure cl;
679 bool wait = false;
667 680
668 closure_init_stack(&cl); 681 closure_init_stack(&cl);
669 682
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
673 struct journal_write *w = c->journal.cur; 686 struct journal_write *w = c->journal.cur;
674 687
675 sectors = __set_blocks(w->data, w->data->keys + nkeys, 688 sectors = __set_blocks(w->data, w->data->keys + nkeys,
676 c) * c->sb.block_size; 689 block_bytes(c)) * c->sb.block_size;
677 690
678 if (sectors <= min_t(size_t, 691 if (sectors <= min_t(size_t,
679 c->journal.blocks_free * c->sb.block_size, 692 c->journal.blocks_free * c->sb.block_size,
680 PAGE_SECTORS << JSET_BITS)) 693 PAGE_SECTORS << JSET_BITS))
681 return w; 694 return w;
682 695
683 /* XXX: tracepoint */ 696 if (wait)
697 closure_wait(&c->journal.wait, &cl);
698
684 if (!journal_full(&c->journal)) { 699 if (!journal_full(&c->journal)) {
685 trace_bcache_journal_entry_full(c); 700 if (wait)
701 trace_bcache_journal_entry_full(c);
686 702
687 /* 703 /*
688 * XXX: If we were inserting so many keys that they 704 * XXX: If we were inserting so many keys that they
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
692 */ 708 */
693 BUG_ON(!w->data->keys); 709 BUG_ON(!w->data->keys);
694 710
695 closure_wait(&w->wait, &cl);
696 journal_try_write(c); /* unlocks */ 711 journal_try_write(c); /* unlocks */
697 } else { 712 } else {
698 trace_bcache_journal_full(c); 713 if (wait)
714 trace_bcache_journal_full(c);
699 715
700 closure_wait(&c->journal.wait, &cl);
701 journal_reclaim(c); 716 journal_reclaim(c);
702 spin_unlock(&c->journal.lock); 717 spin_unlock(&c->journal.lock);
703 718
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
706 721
707 closure_sync(&cl); 722 closure_sync(&cl);
708 spin_lock(&c->journal.lock); 723 spin_lock(&c->journal.lock);
724 wait = true;
709 } 725 }
710} 726}
711 727
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,
736 752
737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); 753 w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
738 754
739 memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); 755 memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
740 w->data->keys += bch_keylist_nkeys(keys); 756 w->data->keys += bch_keylist_nkeys(keys);
741 757
742 ret = &fifo_back(&c->journal.pin); 758 ret = &fifo_back(&c->journal.pin);
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
780{ 796{
781 struct journal *j = &c->journal; 797 struct journal *j = &c->journal;
782 798
783 closure_init_unlocked(&j->io);
784 spin_lock_init(&j->lock); 799 spin_lock_init(&j->lock);
785 INIT_DELAYED_WORK(&j->work, journal_write_work); 800 INIT_DELAYED_WORK(&j->work, journal_write_work);
786 801
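
Note on the journal.c changes above: journal_read_bucket() now rejects entries stepwise and logs why, in order: wrong magic, a size larger than what is left in the bucket or than the journal write buffer, and finally a bad checksum. A sketch of that validate-before-use ordering with a toy header and a placeholder checksum, not the real jset layout or csum_set():

/* Sketch of the journal_read_bucket() validation order: check the magic,
 * then the size bounds, then the checksum, and report which test failed. */
#include <stdint.h>
#include <stdio.h>

#define TOY_MAGIC	0x6a6f75726eULL	/* stand-in for jset_magic() */

struct toy_jset {
	uint64_t magic;
	uint64_t csum;
	uint32_t bytes;		/* payload size */
};

static uint64_t toy_csum(const struct toy_jset *j)
{
	return j->magic ^ j->bytes;	/* placeholder for csum_set() */
}

static const char *validate(const struct toy_jset *j, uint32_t bytes_left,
			    uint32_t max_bytes)
{
	if (j->magic != TOY_MAGIC)
		return "bad magic";
	if (j->bytes > bytes_left || j->bytes > max_bytes)
		return "too big";
	if (j->csum != toy_csum(j))
		return "bad csum";
	return "ok";
}

int main(void)
{
	struct toy_jset j = { TOY_MAGIC, 0, 4096 };

	j.csum = toy_csum(&j);
	printf("entry: %s\n", validate(&j, 8192, 1 << 16));
	j.bytes = 1 << 20;
	printf("oversized entry: %s\n", validate(&j, 8192, 1 << 16));
	return 0;
}
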
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index a6472fda94b2..9180c4465075 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -104,6 +104,7 @@ struct journal {
104 /* used when waiting because the journal was full */ 104 /* used when waiting because the journal was full */
105 struct closure_waitlist wait; 105 struct closure_waitlist wait;
106 struct closure io; 106 struct closure io;
107 int io_in_flight;
107 struct delayed_work work; 108 struct delayed_work work;
108 109
109 /* Number of blocks free in the bucket(s) we're currently writing to */ 110 /* Number of blocks free in the bucket(s) we're currently writing to */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index f2f0998c4a91..9eb60d102de8 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -86,7 +86,7 @@ static void moving_init(struct moving_io *io)
86 bio_get(bio); 86 bio_get(bio);
87 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 87 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
88 88
89 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 89 bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
90 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 90 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
91 PAGE_SECTORS); 91 PAGE_SECTORS);
92 bio->bi_private = &io->cl; 92 bio->bi_private = &io->cl;
@@ -102,7 +102,7 @@ static void write_moving(struct closure *cl)
102 if (!op->error) { 102 if (!op->error) {
103 moving_init(io); 103 moving_init(io);
104 104
105 io->bio.bio.bi_sector = KEY_START(&io->w->key); 105 io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
106 op->write_prio = 1; 106 op->write_prio = 1;
107 op->bio = &io->bio.bio; 107 op->bio = &io->bio.bio;
108 108
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
211 for_each_cache(ca, c, i) { 211 for_each_cache(ca, c, i) {
212 unsigned sectors_to_move = 0; 212 unsigned sectors_to_move = 0;
213 unsigned reserve_sectors = ca->sb.bucket_size * 213 unsigned reserve_sectors = ca->sb.bucket_size *
214 min(fifo_used(&ca->free), ca->free.size / 2); 214 fifo_used(&ca->free[RESERVE_MOVINGGC]);
215 215
216 ca->heap.used = 0; 216 ca->heap.used = 0;
217 217
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index fbcc851ed5a5..5d5d031cf381 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
163static void bcachecg_destroy(struct cgroup *cgroup) 163static void bcachecg_destroy(struct cgroup *cgroup)
164{ 164{
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 free_css_id(&bcache_subsys, &cg->css);
167 kfree(cg); 166 kfree(cg);
168} 167}
169 168
@@ -198,14 +197,14 @@ static bool verify(struct cached_dev *dc, struct bio *bio)
198 197
199static void bio_csum(struct bio *bio, struct bkey *k) 198static void bio_csum(struct bio *bio, struct bkey *k)
200{ 199{
201 struct bio_vec *bv; 200 struct bio_vec bv;
201 struct bvec_iter iter;
202 uint64_t csum = 0; 202 uint64_t csum = 0;
203 int i;
204 203
205 bio_for_each_segment(bv, bio, i) { 204 bio_for_each_segment(bv, bio, iter) {
206 void *d = kmap(bv->bv_page) + bv->bv_offset; 205 void *d = kmap(bv.bv_page) + bv.bv_offset;
207 csum = bch_crc64_update(csum, d, bv->bv_len); 206 csum = bch_crc64_update(csum, d, bv.bv_len);
208 kunmap(bv->bv_page); 207 kunmap(bv.bv_page);
209 } 208 }
210 209
211 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); 210 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
@@ -255,26 +254,44 @@ static void bch_data_insert_keys(struct closure *cl)
255 closure_return(cl); 254 closure_return(cl);
256} 255}
257 256
257static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
258 struct cache_set *c)
259{
260 size_t oldsize = bch_keylist_nkeys(l);
261 size_t newsize = oldsize + u64s;
262
263 /*
264 * The journalling code doesn't handle the case where the keys to insert
265 * is bigger than an empty write: If we just return -ENOMEM here,
266 * bio_insert() and bio_invalidate() will insert the keys created so far
267 * and finish the rest when the keylist is empty.
268 */
269 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
270 return -ENOMEM;
271
272 return __bch_keylist_realloc(l, u64s);
273}
274
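
Note: the new bch_keylist_realloc() wrapper above caps a single insert's keys at what fits in one journal block after the struct jset header, returning -ENOMEM so the caller flushes what it already has and continues. A toy version of that size check, with made-up block and header sizes:

/* Sketch of the size cap in request.c's bch_keylist_realloc(): refuse to grow
 * the keylist past what one journal block can hold after its header. */
#include <stdint.h>
#include <stdio.h>
#include <errno.h>

#define TOY_BLOCK_BYTES	4096u	/* stand-in for block_bytes(c) */
#define TOY_JSET_HDR	64u	/* stand-in for sizeof(struct jset) */

static int keylist_would_fit(size_t old_u64s, size_t extra_u64s)
{
	size_t newsize = old_u64s + extra_u64s;

	if (newsize * sizeof(uint64_t) > TOY_BLOCK_BYTES - TOY_JSET_HDR)
		return -ENOMEM;	/* caller inserts what it has and retries */
	return 0;
}

int main(void)
{
	printf("grow by 3 u64s:   %d\n", keylist_would_fit(100, 3));
	printf("grow by 600 u64s: %d\n", keylist_would_fit(100, 600));
	return 0;
}
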
258static void bch_data_invalidate(struct closure *cl) 275static void bch_data_invalidate(struct closure *cl)
259{ 276{
260 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 277 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
261 struct bio *bio = op->bio; 278 struct bio *bio = op->bio;
262 279
263 pr_debug("invalidating %i sectors from %llu", 280 pr_debug("invalidating %i sectors from %llu",
264 bio_sectors(bio), (uint64_t) bio->bi_sector); 281 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
265 282
266 while (bio_sectors(bio)) { 283 while (bio_sectors(bio)) {
267 unsigned sectors = min(bio_sectors(bio), 284 unsigned sectors = min(bio_sectors(bio),
268 1U << (KEY_SIZE_BITS - 1)); 285 1U << (KEY_SIZE_BITS - 1));
269 286
270 if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 287 if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
271 goto out; 288 goto out;
272 289
273 bio->bi_sector += sectors; 290 bio->bi_iter.bi_sector += sectors;
274 bio->bi_size -= sectors << 9; 291 bio->bi_iter.bi_size -= sectors << 9;
275 292
276 bch_keylist_add(&op->insert_keys, 293 bch_keylist_add(&op->insert_keys,
277 &KEY(op->inode, bio->bi_sector, sectors)); 294 &KEY(op->inode, bio->bi_iter.bi_sector, sectors));
278 } 295 }
279 296
280 op->insert_data_done = true; 297 op->insert_data_done = true;
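
Note: bch_data_invalidate() above turns a bypassed write into whiteout keys, chopping the bio's range into pieces no larger than the key size field can encode and advancing bi_iter.bi_sector as it goes. A sketch of that chunking loop; KEY_SIZE_BITS == 16 is an assumption here, mirroring the 1U << (KEY_SIZE_BITS - 1) cap in the patch rather than quoting the header:

/* Toy model of the bch_data_invalidate() loop: split a sector range into
 * chunks that fit in a single key's size field. */
#include <stdint.h>
#include <stdio.h>

#define TOY_KEY_SIZE_BITS 16	/* assumed value of KEY_SIZE_BITS */

int main(void)
{
	uint64_t sector = 1000;
	unsigned remaining = 70000;	/* sectors left to invalidate */

	while (remaining) {
		unsigned chunk = remaining;
		unsigned max = 1U << (TOY_KEY_SIZE_BITS - 1);

		if (chunk > max)
			chunk = max;

		printf("invalidate %u sectors starting at %llu\n",
		       chunk, (unsigned long long) sector);

		sector += chunk;	/* like bio->bi_iter.bi_sector += sectors */
		remaining -= chunk;
	}
	return 0;
}
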
@@ -336,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl)
336 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 353 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
337 struct bio *bio = op->bio, *n; 354 struct bio *bio = op->bio, *n;
338 355
339 if (op->bypass)
340 return bch_data_invalidate(cl);
341
342 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 356 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
343 set_gc_sectors(op->c); 357 set_gc_sectors(op->c);
344 wake_up_gc(op->c); 358 wake_up_gc(op->c);
345 } 359 }
346 360
361 if (op->bypass)
362 return bch_data_invalidate(cl);
363
347 /* 364 /*
348 * Journal writes are marked REQ_FLUSH; if the original write was a 365 * Journal writes are marked REQ_FLUSH; if the original write was a
349 * flush, it'll wait on the journal write. 366 * flush, it'll wait on the journal write.
@@ -357,21 +374,21 @@ static void bch_data_insert_start(struct closure *cl)
357 374
358 /* 1 for the device pointer and 1 for the chksum */ 375 /* 1 for the device pointer and 1 for the chksum */
359 if (bch_keylist_realloc(&op->insert_keys, 376 if (bch_keylist_realloc(&op->insert_keys,
360 1 + (op->csum ? 1 : 0), 377 3 + (op->csum ? 1 : 0),
361 op->c)) 378 op->c))
362 continue_at(cl, bch_data_insert_keys, bcache_wq); 379 continue_at(cl, bch_data_insert_keys, bcache_wq);
363 380
364 k = op->insert_keys.top; 381 k = op->insert_keys.top;
365 bkey_init(k); 382 bkey_init(k);
366 SET_KEY_INODE(k, op->inode); 383 SET_KEY_INODE(k, op->inode);
367 SET_KEY_OFFSET(k, bio->bi_sector); 384 SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
368 385
369 if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), 386 if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
370 op->write_point, op->write_prio, 387 op->write_point, op->write_prio,
371 op->writeback)) 388 op->writeback))
372 goto err; 389 goto err;
373 390
374 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 391 n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
375 392
376 n->bi_end_io = bch_data_insert_endio; 393 n->bi_end_io = bch_data_insert_endio;
377 n->bi_private = cl; 394 n->bi_private = cl;
@@ -522,7 +539,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
522 (bio->bi_rw & REQ_WRITE))) 539 (bio->bi_rw & REQ_WRITE)))
523 goto skip; 540 goto skip;
524 541
525 if (bio->bi_sector & (c->sb.block_size - 1) || 542 if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
526 bio_sectors(bio) & (c->sb.block_size - 1)) { 543 bio_sectors(bio) & (c->sb.block_size - 1)) {
527 pr_debug("skipping unaligned io"); 544 pr_debug("skipping unaligned io");
528 goto skip; 545 goto skip;
@@ -546,8 +563,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
546 563
547 spin_lock(&dc->io_lock); 564 spin_lock(&dc->io_lock);
548 565
549 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) 566 hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
550 if (i->last == bio->bi_sector && 567 if (i->last == bio->bi_iter.bi_sector &&
551 time_before(jiffies, i->jiffies)) 568 time_before(jiffies, i->jiffies))
552 goto found; 569 goto found;
553 570
@@ -556,8 +573,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
556 add_sequential(task); 573 add_sequential(task);
557 i->sequential = 0; 574 i->sequential = 0;
558found: 575found:
559 if (i->sequential + bio->bi_size > i->sequential) 576 if (i->sequential + bio->bi_iter.bi_size > i->sequential)
560 i->sequential += bio->bi_size; 577 i->sequential += bio->bi_iter.bi_size;
561 578
562 i->last = bio_end_sector(bio); 579 i->last = bio_end_sector(bio);
563 i->jiffies = jiffies + msecs_to_jiffies(5000); 580 i->jiffies = jiffies + msecs_to_jiffies(5000);
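
Note: the bypass heuristic above keeps a small per-device table of recent request end sectors, hashed by bi_sector; when a new request starts where a previous one ended, the I/O size is added to i->sequential, and the "i->sequential + size > i->sequential" test guards the running total against overflow. A sketch of that accounting with the same overflow guard, using a single slot instead of the hash table:

/* Sketch of the sequential-I/O accounting in check_should_bypass(): add the
 * request size to the running total only if it does not overflow, mirroring
 * the overflow test in the patch.  One slot stands in for the hash table. */
#include <stdint.h>
#include <stdio.h>

struct io_track {
	uint64_t last;		/* end sector of the previous request */
	unsigned sequential;	/* bytes seen in this sequential stream */
};

static void account(struct io_track *i, uint64_t sector, unsigned bytes)
{
	if (sector != i->last)
		i->sequential = 0;	/* stream broken: start over */

	if (i->sequential + bytes > i->sequential)	/* overflow guard */
		i->sequential += bytes;

	i->last = sector + (bytes >> 9);
}

int main(void)
{
	struct io_track t = { 0, 0 };

	account(&t, 0, 4096);
	account(&t, 8, 4096);		/* continues where the last one ended */
	account(&t, 1000, 4096);	/* random seek resets the counter */
	printf("sequential bytes after seek: %u\n", t.sequential);
	return 0;
}
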
@@ -597,16 +614,13 @@ struct search {
597 /* Stack frame for bio_complete */ 614 /* Stack frame for bio_complete */
598 struct closure cl; 615 struct closure cl;
599 616
600 struct bcache_device *d;
601
602 struct bbio bio; 617 struct bbio bio;
603 struct bio *orig_bio; 618 struct bio *orig_bio;
604 struct bio *cache_miss; 619 struct bio *cache_miss;
620 struct bcache_device *d;
605 621
606 unsigned insert_bio_sectors; 622 unsigned insert_bio_sectors;
607
608 unsigned recoverable:1; 623 unsigned recoverable:1;
609 unsigned unaligned_bvec:1;
610 unsigned write:1; 624 unsigned write:1;
611 unsigned read_dirty_data:1; 625 unsigned read_dirty_data:1;
612 626
@@ -631,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)
631 645
632 if (error) 646 if (error)
633 s->iop.error = error; 647 s->iop.error = error;
634 else if (ptr_stale(s->iop.c, &b->key, 0)) { 648 else if (!KEY_DIRTY(&b->key) &&
649 ptr_stale(s->iop.c, &b->key, 0)) {
635 atomic_long_inc(&s->iop.c->cache_read_races); 650 atomic_long_inc(&s->iop.c->cache_read_races);
636 s->iop.error = -EINTR; 651 s->iop.error = -EINTR;
637 } 652 }
@@ -650,15 +665,15 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
650 struct bkey *bio_key; 665 struct bkey *bio_key;
651 unsigned ptr; 666 unsigned ptr;
652 667
653 if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) 668 if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
654 return MAP_CONTINUE; 669 return MAP_CONTINUE;
655 670
656 if (KEY_INODE(k) != s->iop.inode || 671 if (KEY_INODE(k) != s->iop.inode ||
657 KEY_START(k) > bio->bi_sector) { 672 KEY_START(k) > bio->bi_iter.bi_sector) {
658 unsigned bio_sectors = bio_sectors(bio); 673 unsigned bio_sectors = bio_sectors(bio);
659 unsigned sectors = KEY_INODE(k) == s->iop.inode 674 unsigned sectors = KEY_INODE(k) == s->iop.inode
660 ? min_t(uint64_t, INT_MAX, 675 ? min_t(uint64_t, INT_MAX,
661 KEY_START(k) - bio->bi_sector) 676 KEY_START(k) - bio->bi_iter.bi_sector)
662 : INT_MAX; 677 : INT_MAX;
663 678
664 int ret = s->d->cache_miss(b, s, bio, sectors); 679 int ret = s->d->cache_miss(b, s, bio, sectors);
@@ -680,14 +695,14 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
680 if (KEY_DIRTY(k)) 695 if (KEY_DIRTY(k))
681 s->read_dirty_data = true; 696 s->read_dirty_data = true;
682 697
683 n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, 698 n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
684 KEY_OFFSET(k) - bio->bi_sector), 699 KEY_OFFSET(k) - bio->bi_iter.bi_sector),
685 GFP_NOIO, s->d->bio_split); 700 GFP_NOIO, s->d->bio_split);
686 701
687 bio_key = &container_of(n, struct bbio, bio)->key; 702 bio_key = &container_of(n, struct bbio, bio)->key;
688 bch_bkey_copy_single_ptr(bio_key, k, ptr); 703 bch_bkey_copy_single_ptr(bio_key, k, ptr);
689 704
690 bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); 705 bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
691 bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); 706 bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
692 707
693 n->bi_end_io = bch_cache_read_endio; 708 n->bi_end_io = bch_cache_read_endio;
@@ -712,10 +727,13 @@ static void cache_lookup(struct closure *cl)
712{ 727{
713 struct search *s = container_of(cl, struct search, iop.cl); 728 struct search *s = container_of(cl, struct search, iop.cl);
714 struct bio *bio = &s->bio.bio; 729 struct bio *bio = &s->bio.bio;
730 int ret;
731
732 bch_btree_op_init(&s->op, -1);
715 733
716 int ret = bch_btree_map_keys(&s->op, s->iop.c, 734 ret = bch_btree_map_keys(&s->op, s->iop.c,
717 &KEY(s->iop.inode, bio->bi_sector, 0), 735 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
718 cache_lookup_fn, MAP_END_KEY); 736 cache_lookup_fn, MAP_END_KEY);
719 if (ret == -EAGAIN) 737 if (ret == -EAGAIN)
720 continue_at(cl, cache_lookup, bcache_wq); 738 continue_at(cl, cache_lookup, bcache_wq);
721 739
@@ -756,13 +774,15 @@ static void bio_complete(struct search *s)
756 } 774 }
757} 775}
758 776
759static void do_bio_hook(struct search *s) 777static void do_bio_hook(struct search *s, struct bio *orig_bio)
760{ 778{
761 struct bio *bio = &s->bio.bio; 779 struct bio *bio = &s->bio.bio;
762 memcpy(bio, s->orig_bio, sizeof(struct bio));
763 780
781 bio_init(bio);
782 __bio_clone_fast(bio, orig_bio);
764 bio->bi_end_io = request_endio; 783 bio->bi_end_io = request_endio;
765 bio->bi_private = &s->cl; 784 bio->bi_private = &s->cl;
785
766 atomic_set(&bio->bi_cnt, 3); 786 atomic_set(&bio->bi_cnt, 3);
767} 787}
768 788
@@ -774,43 +794,36 @@ static void search_free(struct closure *cl)
774 if (s->iop.bio) 794 if (s->iop.bio)
775 bio_put(s->iop.bio); 795 bio_put(s->iop.bio);
776 796
777 if (s->unaligned_bvec)
778 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
779
780 closure_debug_destroy(cl); 797 closure_debug_destroy(cl);
781 mempool_free(s, s->d->c->search); 798 mempool_free(s, s->d->c->search);
782} 799}
783 800
784static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 801static inline struct search *search_alloc(struct bio *bio,
802 struct bcache_device *d)
785{ 803{
786 struct search *s; 804 struct search *s;
787 struct bio_vec *bv;
788 805
789 s = mempool_alloc(d->c->search, GFP_NOIO); 806 s = mempool_alloc(d->c->search, GFP_NOIO);
790 memset(s, 0, offsetof(struct search, iop.insert_keys));
791 807
792 __closure_init(&s->cl, NULL); 808 closure_init(&s->cl, NULL);
809 do_bio_hook(s, bio);
793 810
794 s->iop.inode = d->id;
795 s->iop.c = d->c;
796 s->d = d;
797 s->op.lock = -1;
798 s->iop.write_point = hash_long((unsigned long) current, 16);
799 s->orig_bio = bio; 811 s->orig_bio = bio;
800 s->write = (bio->bi_rw & REQ_WRITE) != 0; 812 s->cache_miss = NULL;
801 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 813 s->d = d;
802 s->recoverable = 1; 814 s->recoverable = 1;
815 s->write = (bio->bi_rw & REQ_WRITE) != 0;
816 s->read_dirty_data = 0;
803 s->start_time = jiffies; 817 s->start_time = jiffies;
804 do_bio_hook(s);
805 818
806 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { 819 s->iop.c = d->c;
807 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); 820 s->iop.bio = NULL;
808 memcpy(bv, bio_iovec(bio), 821 s->iop.inode = d->id;
809 sizeof(struct bio_vec) * bio_segments(bio)); 822 s->iop.write_point = hash_long((unsigned long) current, 16);
810 823 s->iop.write_prio = 0;
811 s->bio.bio.bi_io_vec = bv; 824 s->iop.error = 0;
812 s->unaligned_bvec = 1; 825 s->iop.flags = 0;
813 } 826 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
814 827
815 return s; 828 return s;
816} 829}
@@ -850,26 +863,13 @@ static void cached_dev_read_error(struct closure *cl)
850{ 863{
851 struct search *s = container_of(cl, struct search, cl); 864 struct search *s = container_of(cl, struct search, cl);
852 struct bio *bio = &s->bio.bio; 865 struct bio *bio = &s->bio.bio;
853 struct bio_vec *bv;
854 int i;
855 866
856 if (s->recoverable) { 867 if (s->recoverable) {
857 /* Retry from the backing device: */ 868 /* Retry from the backing device: */
858 trace_bcache_read_retry(s->orig_bio); 869 trace_bcache_read_retry(s->orig_bio);
859 870
860 s->iop.error = 0; 871 s->iop.error = 0;
861 bv = s->bio.bio.bi_io_vec; 872 do_bio_hook(s, s->orig_bio);
862 do_bio_hook(s);
863 s->bio.bio.bi_io_vec = bv;
864
865 if (!s->unaligned_bvec)
866 bio_for_each_segment(bv, s->orig_bio, i)
867 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
868 else
869 memcpy(s->bio.bio.bi_io_vec,
870 bio_iovec(s->orig_bio),
871 sizeof(struct bio_vec) *
872 bio_segments(s->orig_bio));
873 873
874 /* XXX: invalidate cache */ 874 /* XXX: invalidate cache */
875 875
@@ -894,9 +894,9 @@ static void cached_dev_read_done(struct closure *cl)
894 894
895 if (s->iop.bio) { 895 if (s->iop.bio) {
896 bio_reset(s->iop.bio); 896 bio_reset(s->iop.bio);
897 s->iop.bio->bi_sector = s->cache_miss->bi_sector; 897 s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
898 s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; 898 s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
899 s->iop.bio->bi_size = s->insert_bio_sectors << 9; 899 s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
900 bch_bio_map(s->iop.bio, NULL); 900 bch_bio_map(s->iop.bio, NULL);
901 901
902 bio_copy_data(s->cache_miss, s->iop.bio); 902 bio_copy_data(s->cache_miss, s->iop.bio);
@@ -905,8 +905,7 @@ static void cached_dev_read_done(struct closure *cl)
905 s->cache_miss = NULL; 905 s->cache_miss = NULL;
906 } 906 }
907 907
908 if (verify(dc, &s->bio.bio) && s->recoverable && 908 if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
909 !s->unaligned_bvec && !s->read_dirty_data)
910 bch_data_verify(dc, s->orig_bio); 909 bch_data_verify(dc, s->orig_bio);
911 910
912 bio_complete(s); 911 bio_complete(s);
@@ -946,7 +945,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
946 struct bio *miss, *cache_bio; 945 struct bio *miss, *cache_bio;
947 946
948 if (s->cache_miss || s->iop.bypass) { 947 if (s->cache_miss || s->iop.bypass) {
949 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 948 miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
950 ret = miss == bio ? MAP_DONE : MAP_CONTINUE; 949 ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
951 goto out_submit; 950 goto out_submit;
952 } 951 }
@@ -960,7 +959,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
960 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); 959 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
961 960
962 s->iop.replace_key = KEY(s->iop.inode, 961 s->iop.replace_key = KEY(s->iop.inode,
963 bio->bi_sector + s->insert_bio_sectors, 962 bio->bi_iter.bi_sector + s->insert_bio_sectors,
964 s->insert_bio_sectors); 963 s->insert_bio_sectors);
965 964
966 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); 965 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
@@ -969,7 +968,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
969 968
970 s->iop.replace = true; 969 s->iop.replace = true;
971 970
972 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 971 miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
973 972
974 /* btree_search_recurse()'s btree iterator is no good anymore */ 973 /* btree_search_recurse()'s btree iterator is no good anymore */
975 ret = miss == bio ? MAP_DONE : -EINTR; 974 ret = miss == bio ? MAP_DONE : -EINTR;
@@ -980,9 +979,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
980 if (!cache_bio) 979 if (!cache_bio)
981 goto out_submit; 980 goto out_submit;
982 981
983 cache_bio->bi_sector = miss->bi_sector; 982 cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
984 cache_bio->bi_bdev = miss->bi_bdev; 983 cache_bio->bi_bdev = miss->bi_bdev;
985 cache_bio->bi_size = s->insert_bio_sectors << 9; 984 cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
986 985
987 cache_bio->bi_end_io = request_endio; 986 cache_bio->bi_end_io = request_endio;
988 cache_bio->bi_private = &s->cl; 987 cache_bio->bi_private = &s->cl;
@@ -1032,7 +1031,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
1032{ 1031{
1033 struct closure *cl = &s->cl; 1032 struct closure *cl = &s->cl;
1034 struct bio *bio = &s->bio.bio; 1033 struct bio *bio = &s->bio.bio;
1035 struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); 1034 struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
1036 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); 1035 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1037 1036
1038 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); 1037 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
@@ -1088,8 +1087,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
1088 closure_bio_submit(flush, cl, s->d); 1087 closure_bio_submit(flush, cl, s->d);
1089 } 1088 }
1090 } else { 1089 } else {
1091 s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, 1090 s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
1092 dc->disk.bio_split);
1093 1091
1094 closure_bio_submit(bio, cl, s->d); 1092 closure_bio_submit(bio, cl, s->d);
1095 } 1093 }
@@ -1127,13 +1125,13 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1127 part_stat_unlock(); 1125 part_stat_unlock();
1128 1126
1129 bio->bi_bdev = dc->bdev; 1127 bio->bi_bdev = dc->bdev;
1130 bio->bi_sector += dc->sb.data_offset; 1128 bio->bi_iter.bi_sector += dc->sb.data_offset;
1131 1129
1132 if (cached_dev_get(dc)) { 1130 if (cached_dev_get(dc)) {
1133 s = search_alloc(bio, d); 1131 s = search_alloc(bio, d);
1134 trace_bcache_request_start(s->d, bio); 1132 trace_bcache_request_start(s->d, bio);
1135 1133
1136 if (!bio->bi_size) { 1134 if (!bio->bi_iter.bi_size) {
1137 /* 1135 /*
1138 * can't call bch_journal_meta from under 1136 * can't call bch_journal_meta from under
1139 * generic_make_request 1137 * generic_make_request
@@ -1205,24 +1203,24 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
1205static int flash_dev_cache_miss(struct btree *b, struct search *s, 1203static int flash_dev_cache_miss(struct btree *b, struct search *s,
1206 struct bio *bio, unsigned sectors) 1204 struct bio *bio, unsigned sectors)
1207{ 1205{
1208 struct bio_vec *bv; 1206 struct bio_vec bv;
1209 int i; 1207 struct bvec_iter iter;
1210 1208
1211 /* Zero fill bio */ 1209 /* Zero fill bio */
1212 1210
1213 bio_for_each_segment(bv, bio, i) { 1211 bio_for_each_segment(bv, bio, iter) {
1214 unsigned j = min(bv->bv_len >> 9, sectors); 1212 unsigned j = min(bv.bv_len >> 9, sectors);
1215 1213
1216 void *p = kmap(bv->bv_page); 1214 void *p = kmap(bv.bv_page);
1217 memset(p + bv->bv_offset, 0, j << 9); 1215 memset(p + bv.bv_offset, 0, j << 9);
1218 kunmap(bv->bv_page); 1216 kunmap(bv.bv_page);
1219 1217
1220 sectors -= j; 1218 sectors -= j;
1221 } 1219 }
1222 1220
1223 bio_advance(bio, min(sectors << 9, bio->bi_size)); 1221 bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
1224 1222
1225 if (!bio->bi_size) 1223 if (!bio->bi_iter.bi_size)
1226 return MAP_DONE; 1224 return MAP_DONE;
1227 1225
1228 return MAP_CONTINUE; 1226 return MAP_CONTINUE;
@@ -1256,7 +1254,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1256 1254
1257 trace_bcache_request_start(s->d, bio); 1255 trace_bcache_request_start(s->d, bio);
1258 1256
1259 if (!bio->bi_size) { 1257 if (!bio->bi_iter.bi_size) {
1260 /* 1258 /*
1261 * can't call bch_journal_meta from under 1259 * can't call bch_journal_meta from under
1262 * generic_make_request 1260 * generic_make_request
@@ -1266,7 +1264,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1266 bcache_wq); 1264 bcache_wq);
1267 } else if (rw) { 1265 } else if (rw) {
1268 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, 1266 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
1269 &KEY(d->id, bio->bi_sector, 0), 1267 &KEY(d->id, bio->bi_iter.bi_sector, 0),
1270 &KEY(d->id, bio_end_sector(bio), 0)); 1268 &KEY(d->id, bio_end_sector(bio), 0));
1271 1269
1272 s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; 1270 s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
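
All of the request.c hunks above are part of the same conversion to immutable biovecs: the bio's position and remaining size move from bio->bi_sector / bio->bi_size into bio->bi_iter, per-segment walks use a struct bvec_iter instead of indexing bi_io_vec through bi_idx, and the split/clone calls switch to bio_next_split() and bio_clone_fast(), which share the parent's biovec rather than copying it. The new iteration idiom, condensed from flash_dev_cache_miss() above (a kernel-style sketch, not a standalone program):

	struct bio_vec bv;	/* value copy of the current segment */
	struct bvec_iter iter;	/* tracks position within the bio */

	bio_for_each_segment(bv, bio, iter) {
		void *p = kmap(bv.bv_page);

		memset(p + bv.bv_offset, 0, bv.bv_len);
		kunmap(bv.bv_page);
	}

	/* start sector and byte count now live under bi_iter */
	pr_debug("sector %llu, %u bytes",
		 (unsigned long long) bio->bi_iter.bi_sector,
		 bio->bi_iter.bi_size);
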
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 2cd65bf073c2..39f21dbedc38 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -13,17 +13,22 @@ struct data_insert_op {
13 uint16_t write_prio; 13 uint16_t write_prio;
14 short error; 14 short error;
15 15
16 unsigned bypass:1; 16 union {
17 unsigned writeback:1; 17 uint16_t flags;
18 unsigned flush_journal:1;
19 unsigned csum:1;
20 18
21 unsigned replace:1; 19 struct {
22 unsigned replace_collision:1; 20 unsigned bypass:1;
21 unsigned writeback:1;
22 unsigned flush_journal:1;
23 unsigned csum:1;
23 24
24 unsigned insert_data_done:1; 25 unsigned replace:1;
26 unsigned replace_collision:1;
27
28 unsigned insert_data_done:1;
29 };
30 };
25 31
26 /* Anything past this point won't get zeroed in search_alloc() */
27 struct keylist insert_keys; 32 struct keylist insert_keys;
28 BKEY_PADDED(replace_key); 33 BKEY_PADDED(replace_key);
29}; 34};
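
Overlaying the mode bits with a single uint16_t is what lets search_alloc() above reset every flag with one assignment (s->iop.flags = 0) instead of the old partial memset up to insert_keys. A minimal standalone C illustration of the same idiom (the struct and field names here are made up for the example; bit-field layout is implementation defined, but writing flags = 0 zeroes every member regardless of layout):

#include <stdint.h>
#include <stdio.h>

struct op_flags {
	union {
		uint16_t flags;			/* whole-word view */
		struct {			/* per-bit view */
			unsigned bypass:1;
			unsigned writeback:1;
			unsigned flush_journal:1;
		};
	};
};

int main(void)
{
	struct op_flags op;

	op.flags = 0;		/* clears every bit-field in one store */
	op.writeback = 1;
	printf("flags = %#x\n", (unsigned) op.flags);
	return 0;
}
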
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c57bfa071a57..24a3a1546caa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -9,6 +9,7 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "extents.h"
12#include "request.h" 13#include "request.h"
13#include "writeback.h" 14#include "writeback.h"
14 15
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
225 struct cached_dev *dc = bio->bi_private; 226 struct cached_dev *dc = bio->bi_private;
226 /* XXX: error checking */ 227 /* XXX: error checking */
227 228
228 closure_put(&dc->sb_write.cl); 229 closure_put(&dc->sb_write);
229} 230}
230 231
231static void __write_super(struct cache_sb *sb, struct bio *bio) 232static void __write_super(struct cache_sb *sb, struct bio *bio)
@@ -233,9 +234,9 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
233 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); 234 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
234 unsigned i; 235 unsigned i;
235 236
236 bio->bi_sector = SB_SECTOR; 237 bio->bi_iter.bi_sector = SB_SECTOR;
237 bio->bi_rw = REQ_SYNC|REQ_META; 238 bio->bi_rw = REQ_SYNC|REQ_META;
238 bio->bi_size = SB_SIZE; 239 bio->bi_iter.bi_size = SB_SIZE;
239 bch_bio_map(bio, NULL); 240 bch_bio_map(bio, NULL);
240 241
241 out->offset = cpu_to_le64(sb->offset); 242 out->offset = cpu_to_le64(sb->offset);
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
263 submit_bio(REQ_WRITE, bio); 264 submit_bio(REQ_WRITE, bio);
264} 265}
265 266
267static void bch_write_bdev_super_unlock(struct closure *cl)
268{
269 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
270
271 up(&dc->sb_write_mutex);
272}
273
266void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) 274void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
267{ 275{
268 struct closure *cl = &dc->sb_write.cl; 276 struct closure *cl = &dc->sb_write;
269 struct bio *bio = &dc->sb_bio; 277 struct bio *bio = &dc->sb_bio;
270 278
271 closure_lock(&dc->sb_write, parent); 279 down(&dc->sb_write_mutex);
280 closure_init(cl, parent);
272 281
273 bio_reset(bio); 282 bio_reset(bio);
274 bio->bi_bdev = dc->bdev; 283 bio->bi_bdev = dc->bdev;
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
278 closure_get(cl); 287 closure_get(cl);
279 __write_super(&dc->sb, bio); 288 __write_super(&dc->sb, bio);
280 289
281 closure_return(cl); 290 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
282} 291}
283 292
284static void write_super_endio(struct bio *bio, int error) 293static void write_super_endio(struct bio *bio, int error)
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
286 struct cache *ca = bio->bi_private; 295 struct cache *ca = bio->bi_private;
287 296
288 bch_count_io_errors(ca, error, "writing superblock"); 297 bch_count_io_errors(ca, error, "writing superblock");
289 closure_put(&ca->set->sb_write.cl); 298 closure_put(&ca->set->sb_write);
299}
300
301static void bcache_write_super_unlock(struct closure *cl)
302{
303 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
304
305 up(&c->sb_write_mutex);
290} 306}
291 307
292void bcache_write_super(struct cache_set *c) 308void bcache_write_super(struct cache_set *c)
293{ 309{
294 struct closure *cl = &c->sb_write.cl; 310 struct closure *cl = &c->sb_write;
295 struct cache *ca; 311 struct cache *ca;
296 unsigned i; 312 unsigned i;
297 313
298 closure_lock(&c->sb_write, &c->cl); 314 down(&c->sb_write_mutex);
315 closure_init(cl, &c->cl);
299 316
300 c->sb.seq++; 317 c->sb.seq++;
301 318
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
317 __write_super(&ca->sb, bio); 334 __write_super(&ca->sb, bio);
318 } 335 }
319 336
320 closure_return(cl); 337 closure_return_with_destructor(cl, bcache_write_super_unlock);
321} 338}
322 339
323/* UUID io */ 340/* UUID io */
@@ -325,29 +342,37 @@ void bcache_write_super(struct cache_set *c)
325static void uuid_endio(struct bio *bio, int error) 342static void uuid_endio(struct bio *bio, int error)
326{ 343{
327 struct closure *cl = bio->bi_private; 344 struct closure *cl = bio->bi_private;
328 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); 345 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
329 346
330 cache_set_err_on(error, c, "accessing uuids"); 347 cache_set_err_on(error, c, "accessing uuids");
331 bch_bbio_free(bio, c); 348 bch_bbio_free(bio, c);
332 closure_put(cl); 349 closure_put(cl);
333} 350}
334 351
352static void uuid_io_unlock(struct closure *cl)
353{
354 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
355
356 up(&c->uuid_write_mutex);
357}
358
335static void uuid_io(struct cache_set *c, unsigned long rw, 359static void uuid_io(struct cache_set *c, unsigned long rw,
336 struct bkey *k, struct closure *parent) 360 struct bkey *k, struct closure *parent)
337{ 361{
338 struct closure *cl = &c->uuid_write.cl; 362 struct closure *cl = &c->uuid_write;
339 struct uuid_entry *u; 363 struct uuid_entry *u;
340 unsigned i; 364 unsigned i;
341 char buf[80]; 365 char buf[80];
342 366
343 BUG_ON(!parent); 367 BUG_ON(!parent);
344 closure_lock(&c->uuid_write, parent); 368 down(&c->uuid_write_mutex);
369 closure_init(cl, parent);
345 370
346 for (i = 0; i < KEY_PTRS(k); i++) { 371 for (i = 0; i < KEY_PTRS(k); i++) {
347 struct bio *bio = bch_bbio_alloc(c); 372 struct bio *bio = bch_bbio_alloc(c);
348 373
349 bio->bi_rw = REQ_SYNC|REQ_META|rw; 374 bio->bi_rw = REQ_SYNC|REQ_META|rw;
350 bio->bi_size = KEY_SIZE(k) << 9; 375 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
351 376
352 bio->bi_end_io = uuid_endio; 377 bio->bi_end_io = uuid_endio;
353 bio->bi_private = cl; 378 bio->bi_private = cl;
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
359 break; 384 break;
360 } 385 }
361 386
362 bch_bkey_to_text(buf, sizeof(buf), k); 387 bch_extent_to_text(buf, sizeof(buf), k);
363 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); 388 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
364 389
365 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 390 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
368 u - c->uuids, u->uuid, u->label, 393 u - c->uuids, u->uuid, u->label,
369 u->first_reg, u->last_reg, u->invalidated); 394 u->first_reg, u->last_reg, u->invalidated);
370 395
371 closure_return(cl); 396 closure_return_with_destructor(cl, uuid_io_unlock);
372} 397}
373 398
374static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) 399static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
375{ 400{
376 struct bkey *k = &j->uuid_bucket; 401 struct bkey *k = &j->uuid_bucket;
377 402
378 if (bch_btree_ptr_invalid(c, k)) 403 if (__bch_btree_ptr_invalid(c, k))
379 return "bad uuid pointer"; 404 return "bad uuid pointer";
380 405
381 bkey_copy(&c->uuid_bucket, k); 406 bkey_copy(&c->uuid_bucket, k);
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)
420 445
421 lockdep_assert_held(&bch_register_lock); 446 lockdep_assert_held(&bch_register_lock);
422 447
423 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) 448 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
424 return 1; 449 return 1;
425 450
426 SET_KEY_SIZE(&k.key, c->sb.bucket_size); 451 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -503,10 +528,10 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
503 528
504 closure_init_stack(cl); 529 closure_init_stack(cl);
505 530
506 bio->bi_sector = bucket * ca->sb.bucket_size; 531 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
507 bio->bi_bdev = ca->bdev; 532 bio->bi_bdev = ca->bdev;
508 bio->bi_rw = REQ_SYNC|REQ_META|rw; 533 bio->bi_rw = REQ_SYNC|REQ_META|rw;
509 bio->bi_size = bucket_bytes(ca); 534 bio->bi_iter.bi_size = bucket_bytes(ca);
510 535
511 bio->bi_end_io = prio_endio; 536 bio->bi_end_io = prio_endio;
512 bio->bi_private = ca; 537 bio->bi_private = ca;
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
538 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), 563 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
539 &ca->meta_sectors_written); 564 &ca->meta_sectors_written);
540 565
541 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 566 //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
542 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 567 // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
543 568
544 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 569 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
545 long bucket; 570 long bucket;
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
558 p->magic = pset_magic(&ca->sb); 583 p->magic = pset_magic(&ca->sb);
559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 584 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
560 585
561 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 586 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
562 BUG_ON(bucket == -1); 587 BUG_ON(bucket == -1);
563 588
564 mutex_unlock(&ca->set->bucket_lock); 589 mutex_unlock(&ca->set->bucket_lock);
@@ -739,8 +764,6 @@ static void bcache_device_free(struct bcache_device *d)
739 } 764 }
740 765
741 bio_split_pool_free(&d->bio_split_hook); 766 bio_split_pool_free(&d->bio_split_hook);
742 if (d->unaligned_bvec)
743 mempool_destroy(d->unaligned_bvec);
744 if (d->bio_split) 767 if (d->bio_split)
745 bioset_free(d->bio_split); 768 bioset_free(d->bio_split);
746 if (is_vmalloc_addr(d->full_dirty_stripes)) 769 if (is_vmalloc_addr(d->full_dirty_stripes))
@@ -793,8 +816,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
793 return minor; 816 return minor;
794 817
795 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 818 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
796 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
797 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
798 bio_split_pool_init(&d->bio_split_hook) || 819 bio_split_pool_init(&d->bio_split_hook) ||
799 !(d->disk = alloc_disk(1))) { 820 !(d->disk = alloc_disk(1))) {
800 ida_simple_remove(&bcache_minor, minor); 821 ida_simple_remove(&bcache_minor, minor);
@@ -1102,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1102 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); 1123 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1103 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); 1124 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1104 INIT_WORK(&dc->detach, cached_dev_detach_finish); 1125 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1105 closure_init_unlocked(&dc->sb_write); 1126 sema_init(&dc->sb_write_mutex, 1);
1106 INIT_LIST_HEAD(&dc->io_lru); 1127 INIT_LIST_HEAD(&dc->io_lru);
1107 spin_lock_init(&dc->io_lock); 1128 spin_lock_init(&dc->io_lock);
1108 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1129 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
@@ -1114,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1114 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1135 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1115 } 1136 }
1116 1137
1138 dc->disk.stripe_size = q->limits.io_opt >> 9;
1139
1140 if (dc->disk.stripe_size)
1141 dc->partial_stripes_expensive =
1142 q->limits.raid_partial_stripes_expensive;
1143
1117 ret = bcache_device_init(&dc->disk, block_size, 1144 ret = bcache_device_init(&dc->disk, block_size,
1118 dc->bdev->bd_part->nr_sects - dc->sb.data_offset); 1145 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1119 if (ret) 1146 if (ret)
@@ -1325,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
1325 if (ca) 1352 if (ca)
1326 kobject_put(&ca->kobj); 1353 kobject_put(&ca->kobj);
1327 1354
1355 bch_bset_sort_state_free(&c->sort);
1328 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1356 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1329 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1330 1357
1331 if (c->bio_split) 1358 if (c->bio_split)
1332 bioset_free(c->bio_split); 1359 bioset_free(c->bio_split);
@@ -1451,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1451 c->block_bits = ilog2(sb->block_size); 1478 c->block_bits = ilog2(sb->block_size);
1452 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1479 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1453 1480
1454 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; 1481 c->btree_pages = bucket_pages(c);
1455 if (c->btree_pages > BTREE_MAX_PAGES) 1482 if (c->btree_pages > BTREE_MAX_PAGES)
1456 c->btree_pages = max_t(int, c->btree_pages / 4, 1483 c->btree_pages = max_t(int, c->btree_pages / 4,
1457 BTREE_MAX_PAGES); 1484 BTREE_MAX_PAGES);
1458 1485
1459 c->sort_crit_factor = int_sqrt(c->btree_pages); 1486 sema_init(&c->sb_write_mutex, 1);
1460
1461 closure_init_unlocked(&c->sb_write);
1462 mutex_init(&c->bucket_lock); 1487 mutex_init(&c->bucket_lock);
1463 init_waitqueue_head(&c->try_wait); 1488 init_waitqueue_head(&c->try_wait);
1464 init_waitqueue_head(&c->bucket_wait); 1489 init_waitqueue_head(&c->bucket_wait);
1465 closure_init_unlocked(&c->uuid_write); 1490 sema_init(&c->uuid_write_mutex, 1);
1466 mutex_init(&c->sort_lock);
1467 1491
1468 spin_lock_init(&c->sort_time.lock);
1469 spin_lock_init(&c->btree_gc_time.lock); 1492 spin_lock_init(&c->btree_gc_time.lock);
1470 spin_lock_init(&c->btree_split_time.lock); 1493 spin_lock_init(&c->btree_split_time.lock);
1471 spin_lock_init(&c->btree_read_time.lock); 1494 spin_lock_init(&c->btree_read_time.lock);
@@ -1493,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1493 bucket_pages(c))) || 1516 bucket_pages(c))) ||
1494 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1517 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1495 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1518 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1496 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1497 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1519 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1498 bch_journal_alloc(c) || 1520 bch_journal_alloc(c) ||
1499 bch_btree_cache_alloc(c) || 1521 bch_btree_cache_alloc(c) ||
1500 bch_open_buckets_alloc(c)) 1522 bch_open_buckets_alloc(c) ||
1523 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1501 goto err; 1524 goto err;
1502 1525
1503 c->congested_read_threshold_us = 2000; 1526 c->congested_read_threshold_us = 2000;
@@ -1553,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
1553 k = &j->btree_root; 1576 k = &j->btree_root;
1554 1577
1555 err = "bad btree root"; 1578 err = "bad btree root";
1556 if (bch_btree_ptr_invalid(c, k)) 1579 if (__bch_btree_ptr_invalid(c, k))
1557 goto err; 1580 goto err;
1558 1581
1559 err = "error reading btree root"; 1582 err = "error reading btree root";
@@ -1747,6 +1770,7 @@ err:
1747void bch_cache_release(struct kobject *kobj) 1770void bch_cache_release(struct kobject *kobj)
1748{ 1771{
1749 struct cache *ca = container_of(kobj, struct cache, kobj); 1772 struct cache *ca = container_of(kobj, struct cache, kobj);
1773 unsigned i;
1750 1774
1751 if (ca->set) 1775 if (ca->set)
1752 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1776 ca->set->cache[ca->sb.nr_this_dev] = NULL;
@@ -1760,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
1760 free_heap(&ca->heap); 1784 free_heap(&ca->heap);
1761 free_fifo(&ca->unused); 1785 free_fifo(&ca->unused);
1762 free_fifo(&ca->free_inc); 1786 free_fifo(&ca->free_inc);
1763 free_fifo(&ca->free); 1787
1788 for (i = 0; i < RESERVE_NR; i++)
1789 free_fifo(&ca->free[i]);
1764 1790
1765 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1791 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1766 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1792 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
@@ -1786,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1786 ca->journal.bio.bi_max_vecs = 8; 1812 ca->journal.bio.bi_max_vecs = 8;
1787 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1813 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1788 1814
1789 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; 1815 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
1790 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1791 1816
1792 if (!init_fifo(&ca->free, free, GFP_KERNEL) || 1817 if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
1818 !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1819 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
1820 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1793 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || 1821 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1794 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || 1822 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1795 !init_heap(&ca->heap, free << 3, GFP_KERNEL) || 1823 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
@@ -2034,7 +2062,8 @@ static void bcache_exit(void)
2034 kobject_put(bcache_kobj); 2062 kobject_put(bcache_kobj);
2035 if (bcache_wq) 2063 if (bcache_wq)
2036 destroy_workqueue(bcache_wq); 2064 destroy_workqueue(bcache_wq);
2037 unregister_blkdev(bcache_major, "bcache"); 2065 if (bcache_major)
2066 unregister_blkdev(bcache_major, "bcache");
2038 unregister_reboot_notifier(&reboot); 2067 unregister_reboot_notifier(&reboot);
2039} 2068}
2040 2069
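
The superblock and UUID paths above drop the old closure_lock()/closure_init_unlocked() helpers in favour of a plain semaphore guarding a bare closure, with the unlock pushed into a closure destructor so it only happens once the I/O has completed. The shape of the pattern, condensed from bch_write_bdev_super() and uuid_io() (kernel-style sketch; the sketch_* names are illustrative, the fields are the ones introduced above):

static void sketch_sb_write_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);		/* next writer may proceed */
}

static void sketch_sb_write(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;

	down(&dc->sb_write_mutex);		/* one superblock write at a time */
	closure_init(cl, parent);

	/* ... fill in and submit the bio, taking closure refs per submission ... */

	closure_return_with_destructor(cl, sketch_sb_write_unlock);
}
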
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a1f85612f0b3..d8458d477a12 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
102rw_attribute(key_merging_disabled); 102rw_attribute(key_merging_disabled);
103rw_attribute(gc_always_rewrite); 103rw_attribute(gc_always_rewrite);
104rw_attribute(expensive_debug_checks); 104rw_attribute(expensive_debug_checks);
105rw_attribute(freelist_percent);
106rw_attribute(cache_replacement_policy); 105rw_attribute(cache_replacement_policy);
107rw_attribute(btree_shrinker_disabled); 106rw_attribute(btree_shrinker_disabled);
108rw_attribute(copy_gc_enabled); 107rw_attribute(copy_gc_enabled);
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
401}; 400};
402KTYPE(bch_flash_dev); 401KTYPE(bch_flash_dev);
403 402
403struct bset_stats_op {
404 struct btree_op op;
405 size_t nodes;
406 struct bset_stats stats;
407};
408
409static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
410{
411 struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
412
413 op->nodes++;
414 bch_btree_keys_stats(&b->keys, &op->stats);
415
416 return MAP_CONTINUE;
417}
418
419static int bch_bset_print_stats(struct cache_set *c, char *buf)
420{
421 struct bset_stats_op op;
422 int ret;
423
424 memset(&op, 0, sizeof(op));
425 bch_btree_op_init(&op.op, -1);
426
427 ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
428 if (ret < 0)
429 return ret;
430
431 return snprintf(buf, PAGE_SIZE,
432 "btree nodes: %zu\n"
433 "written sets: %zu\n"
434 "unwritten sets: %zu\n"
435 "written key bytes: %zu\n"
436 "unwritten key bytes: %zu\n"
437 "floats: %zu\n"
438 "failed: %zu\n",
439 op.nodes,
440 op.stats.sets_written, op.stats.sets_unwritten,
441 op.stats.bytes_written, op.stats.bytes_unwritten,
442 op.stats.floats, op.stats.failed);
443}
444
404SHOW(__bch_cache_set) 445SHOW(__bch_cache_set)
405{ 446{
406 unsigned root_usage(struct cache_set *c) 447 unsigned root_usage(struct cache_set *c)
@@ -419,7 +460,7 @@ lock_root:
419 rw_lock(false, b, b->level); 460 rw_lock(false, b, b->level);
420 } while (b != c->root); 461 } while (b != c->root);
421 462
422 for_each_key_filter(b, k, &iter, bch_ptr_bad) 463 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
423 bytes += bkey_bytes(k); 464 bytes += bkey_bytes(k);
424 465
425 rw_unlock(false, b); 466 rw_unlock(false, b);
@@ -434,7 +475,7 @@ lock_root:
434 475
435 mutex_lock(&c->bucket_lock); 476 mutex_lock(&c->bucket_lock);
436 list_for_each_entry(b, &c->btree_cache, list) 477 list_for_each_entry(b, &c->btree_cache, list)
437 ret += 1 << (b->page_order + PAGE_SHIFT); 478 ret += 1 << (b->keys.page_order + PAGE_SHIFT);
438 479
439 mutex_unlock(&c->bucket_lock); 480 mutex_unlock(&c->bucket_lock);
440 return ret; 481 return ret;
@@ -491,7 +532,7 @@ lock_root:
491 532
492 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); 533 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
493 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); 534 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
494 sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); 535 sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
495 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); 536 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
496 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); 537 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
497 538
@@ -711,9 +752,6 @@ SHOW(__bch_cache)
711 sysfs_print(io_errors, 752 sysfs_print(io_errors,
712 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); 753 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
713 754
714 sysfs_print(freelist_percent, ca->free.size * 100 /
715 ((size_t) ca->sb.nbuckets));
716
717 if (attr == &sysfs_cache_replacement_policy) 755 if (attr == &sysfs_cache_replacement_policy)
718 return bch_snprint_string_list(buf, PAGE_SIZE, 756 return bch_snprint_string_list(buf, PAGE_SIZE,
719 cache_replacement_policies, 757 cache_replacement_policies,
@@ -820,32 +858,6 @@ STORE(__bch_cache)
820 } 858 }
821 } 859 }
822 860
823 if (attr == &sysfs_freelist_percent) {
824 DECLARE_FIFO(long, free);
825 long i;
826 size_t p = strtoul_or_return(buf);
827
828 p = clamp_t(size_t,
829 ((size_t) ca->sb.nbuckets * p) / 100,
830 roundup_pow_of_two(ca->sb.nbuckets) >> 9,
831 ca->sb.nbuckets / 2);
832
833 if (!init_fifo_exact(&free, p, GFP_KERNEL))
834 return -ENOMEM;
835
836 mutex_lock(&ca->set->bucket_lock);
837
838 fifo_move(&free, &ca->free);
839 fifo_swap(&free, &ca->free);
840
841 mutex_unlock(&ca->set->bucket_lock);
842
843 while (fifo_pop(&free, i))
844 atomic_dec(&ca->buckets[i].pin);
845
846 free_fifo(&free);
847 }
848
849 if (attr == &sysfs_clear_stats) { 861 if (attr == &sysfs_clear_stats) {
850 atomic_long_set(&ca->sectors_written, 0); 862 atomic_long_set(&ca->sectors_written, 0);
851 atomic_long_set(&ca->btree_sectors_written, 0); 863 atomic_long_set(&ca->btree_sectors_written, 0);
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
869 &sysfs_metadata_written, 881 &sysfs_metadata_written,
870 &sysfs_io_errors, 882 &sysfs_io_errors,
871 &sysfs_clear_stats, 883 &sysfs_clear_stats,
872 &sysfs_freelist_percent,
873 &sysfs_cache_replacement_policy, 884 &sysfs_cache_replacement_policy,
874 NULL 885 NULL
875}; 886};
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index bb37618e7664..db3ae4c2b223 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -224,10 +224,10 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
224 224
225void bch_bio_map(struct bio *bio, void *base) 225void bch_bio_map(struct bio *bio, void *base)
226{ 226{
227 size_t size = bio->bi_size; 227 size_t size = bio->bi_iter.bi_size;
228 struct bio_vec *bv = bio->bi_io_vec; 228 struct bio_vec *bv = bio->bi_io_vec;
229 229
230 BUG_ON(!bio->bi_size); 230 BUG_ON(!bio->bi_iter.bi_size);
231 BUG_ON(bio->bi_vcnt); 231 BUG_ON(bio->bi_vcnt);
232 232
233 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; 233 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1030c6020e98..ac7d0d1f70d7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -2,6 +2,7 @@
2#ifndef _BCACHE_UTIL_H 2#ifndef _BCACHE_UTIL_H
3#define _BCACHE_UTIL_H 3#define _BCACHE_UTIL_H
4 4
5#include <linux/blkdev.h>
5#include <linux/errno.h> 6#include <linux/errno.h>
6#include <linux/kernel.h> 7#include <linux/kernel.h>
7#include <linux/llist.h> 8#include <linux/llist.h>
@@ -17,11 +18,13 @@ struct closure;
17 18
18#ifdef CONFIG_BCACHE_DEBUG 19#ifdef CONFIG_BCACHE_DEBUG
19 20
21#define EBUG_ON(cond) BUG_ON(cond)
20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 23#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 24
23#else /* DEBUG */ 25#else /* DEBUG */
24 26
27#define EBUG_ON(cond) do { if (cond); } while (0)
25#define atomic_dec_bug(v) atomic_dec(v) 28#define atomic_dec_bug(v) atomic_dec(v)
26#define atomic_inc_bug(v, i) atomic_inc(v) 29#define atomic_inc_bug(v, i) atomic_inc(v)
27 30
@@ -391,6 +394,11 @@ struct time_stats {
391 394
392void bch_time_stats_update(struct time_stats *stats, uint64_t time); 395void bch_time_stats_update(struct time_stats *stats, uint64_t time);
393 396
397static inline unsigned local_clock_us(void)
398{
399 return local_clock() >> 10;
400}
401
394#define NSEC_PER_ns 1L 402#define NSEC_PER_ns 1L
395#define NSEC_PER_us NSEC_PER_USEC 403#define NSEC_PER_us NSEC_PER_USEC
396#define NSEC_PER_ms NSEC_PER_MSEC 404#define NSEC_PER_ms NSEC_PER_MSEC
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 6c44fe059c27..f4300e4c0114 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -111,7 +111,7 @@ static void dirty_init(struct keybuf_key *w)
111 if (!io->dc->writeback_percent) 111 if (!io->dc->writeback_percent)
112 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 112 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
113 113
114 bio->bi_size = KEY_SIZE(&w->key) << 9; 114 bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
115 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); 115 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
116 bio->bi_private = w; 116 bio->bi_private = w;
117 bio->bi_io_vec = bio->bi_inline_vecs; 117 bio->bi_io_vec = bio->bi_inline_vecs;
@@ -184,7 +184,7 @@ static void write_dirty(struct closure *cl)
184 184
185 dirty_init(w); 185 dirty_init(w);
186 io->bio.bi_rw = WRITE; 186 io->bio.bi_rw = WRITE;
187 io->bio.bi_sector = KEY_START(&w->key); 187 io->bio.bi_iter.bi_sector = KEY_START(&w->key);
188 io->bio.bi_bdev = io->dc->bdev; 188 io->bio.bi_bdev = io->dc->bdev;
189 io->bio.bi_end_io = dirty_endio; 189 io->bio.bi_end_io = dirty_endio;
190 190
@@ -253,7 +253,7 @@ static void read_dirty(struct cached_dev *dc)
253 io->dc = dc; 253 io->dc = dc;
254 254
255 dirty_init(w); 255 dirty_init(w);
256 io->bio.bi_sector = PTR_OFFSET(&w->key, 0); 256 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
257 io->bio.bi_bdev = PTR_CACHE(dc->disk.c, 257 io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
258 &w->key, 0)->bdev; 258 &w->key, 0)->bdev;
259 io->bio.bi_rw = READ; 259 io->bio.bi_rw = READ;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c9ddcf4614b9..e2f8598937ac 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -50,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
50 return false; 50 return false;
51 51
52 if (dc->partial_stripes_expensive && 52 if (dc->partial_stripes_expensive &&
53 bcache_dev_stripe_dirty(dc, bio->bi_sector, 53 bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
54 bio_sectors(bio))) 54 bio_sectors(bio)))
55 return true; 55 return true;
56 56
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 12dc29ba7399..4195a01b1535 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1635,7 +1635,7 @@ int bitmap_create(struct mddev *mddev)
1635 sector_t blocks = mddev->resync_max_sectors; 1635 sector_t blocks = mddev->resync_max_sectors;
1636 struct file *file = mddev->bitmap_info.file; 1636 struct file *file = mddev->bitmap_info.file;
1637 int err; 1637 int err;
1638 struct sysfs_dirent *bm = NULL; 1638 struct kernfs_node *bm = NULL;
1639 1639
1640 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1640 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1641 1641
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index df4aeb6ac6f0..30210b9c4ef9 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -225,7 +225,7 @@ struct bitmap {
225 wait_queue_head_t overflow_wait; 225 wait_queue_head_t overflow_wait;
226 wait_queue_head_t behind_wait; 226 wait_queue_head_t behind_wait;
227 227
228 struct sysfs_dirent *sysfs_can_clear; 228 struct kernfs_node *sysfs_can_clear;
229}; 229};
230 230
231/* the bitmap API */ 231/* the bitmap API */
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 3a8cfa2645c7..dd3646111561 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -17,55 +17,24 @@
17 * original bio state. 17 * original bio state.
18 */ 18 */
19 19
20struct dm_bio_vec_details {
21#if PAGE_SIZE < 65536
22 __u16 bv_len;
23 __u16 bv_offset;
24#else
25 unsigned bv_len;
26 unsigned bv_offset;
27#endif
28};
29
30struct dm_bio_details { 20struct dm_bio_details {
31 sector_t bi_sector;
32 struct block_device *bi_bdev; 21 struct block_device *bi_bdev;
33 unsigned int bi_size;
34 unsigned short bi_idx;
35 unsigned long bi_flags; 22 unsigned long bi_flags;
36 struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; 23 struct bvec_iter bi_iter;
37}; 24};
38 25
39static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) 26static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
40{ 27{
41 unsigned i;
42
43 bd->bi_sector = bio->bi_sector;
44 bd->bi_bdev = bio->bi_bdev; 28 bd->bi_bdev = bio->bi_bdev;
45 bd->bi_size = bio->bi_size;
46 bd->bi_idx = bio->bi_idx;
47 bd->bi_flags = bio->bi_flags; 29 bd->bi_flags = bio->bi_flags;
48 30 bd->bi_iter = bio->bi_iter;
49 for (i = 0; i < bio->bi_vcnt; i++) {
50 bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
51 bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
52 }
53} 31}
54 32
55static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) 33static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
56{ 34{
57 unsigned i;
58
59 bio->bi_sector = bd->bi_sector;
60 bio->bi_bdev = bd->bi_bdev; 35 bio->bi_bdev = bd->bi_bdev;
61 bio->bi_size = bd->bi_size;
62 bio->bi_idx = bd->bi_idx;
63 bio->bi_flags = bd->bi_flags; 36 bio->bi_flags = bd->bi_flags;
64 37 bio->bi_iter = bd->bi_iter;
65 for (i = 0; i < bio->bi_vcnt; i++) {
66 bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
67 bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
68 }
69} 38}
70 39
71#endif 40#endif
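
With the bio internals folded into bi_iter, saving and restoring a bio becomes three trivial member copies rather than a walk over the biovec. A hedged usage sketch of the pair (remap_and_resubmit() is a hypothetical helper standing in for whatever the target does between the two calls):

	struct dm_bio_details bd;

	dm_bio_record(&bd, bio);	/* snapshot bi_bdev, bi_flags, bi_iter */

	if (remap_and_resubmit(bio) < 0)
		dm_bio_restore(&bd, bio);	/* put the bio back so it can be
						 * retried on another path/leg */
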
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 54bdd923316f..66c5d130c8c2 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -104,6 +104,8 @@ struct dm_bufio_client {
104 struct list_head reserved_buffers; 104 struct list_head reserved_buffers;
105 unsigned need_reserved_buffers; 105 unsigned need_reserved_buffers;
106 106
107 unsigned minimum_buffers;
108
107 struct hlist_head *cache_hash; 109 struct hlist_head *cache_hash;
108 wait_queue_head_t free_buffer_wait; 110 wait_queue_head_t free_buffer_wait;
109 111
@@ -538,7 +540,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
538 bio_init(&b->bio); 540 bio_init(&b->bio);
539 b->bio.bi_io_vec = b->bio_vec; 541 b->bio.bi_io_vec = b->bio_vec;
540 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; 542 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
541 b->bio.bi_sector = block << b->c->sectors_per_block_bits; 543 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
542 b->bio.bi_bdev = b->c->bdev; 544 b->bio.bi_bdev = b->c->bdev;
543 b->bio.bi_end_io = end_io; 545 b->bio.bi_end_io = end_io;
544 546
@@ -861,8 +863,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
861 buffers = dm_bufio_cache_size_per_client >> 863 buffers = dm_bufio_cache_size_per_client >>
862 (c->sectors_per_block_bits + SECTOR_SHIFT); 864 (c->sectors_per_block_bits + SECTOR_SHIFT);
863 865
864 if (buffers < DM_BUFIO_MIN_BUFFERS) 866 if (buffers < c->minimum_buffers)
865 buffers = DM_BUFIO_MIN_BUFFERS; 867 buffers = c->minimum_buffers;
866 868
867 *limit_buffers = buffers; 869 *limit_buffers = buffers;
868 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; 870 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@ -1350,6 +1352,34 @@ retry:
1350} 1352}
1351EXPORT_SYMBOL_GPL(dm_bufio_release_move); 1353EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1352 1354
1355/*
1356 * Free the given buffer.
1357 *
1358 * This is just a hint, if the buffer is in use or dirty, this function
1359 * does nothing.
1360 */
1361void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1362{
1363 struct dm_buffer *b;
1364
1365 dm_bufio_lock(c);
1366
1367 b = __find(c, block);
1368 if (b && likely(!b->hold_count) && likely(!b->state)) {
1369 __unlink_buffer(b);
1370 __free_buffer_wake(b);
1371 }
1372
1373 dm_bufio_unlock(c);
1374}
1375EXPORT_SYMBOL(dm_bufio_forget);
1376
1377void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1378{
1379 c->minimum_buffers = n;
1380}
1381EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
1382
1353unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) 1383unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1354{ 1384{
1355 return c->block_size; 1385 return c->block_size;
@@ -1546,6 +1576,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1546 INIT_LIST_HEAD(&c->reserved_buffers); 1576 INIT_LIST_HEAD(&c->reserved_buffers);
1547 c->need_reserved_buffers = reserved_buffers; 1577 c->need_reserved_buffers = reserved_buffers;
1548 1578
1579 c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
1580
1549 init_waitqueue_head(&c->free_buffer_wait); 1581 init_waitqueue_head(&c->free_buffer_wait);
1550 c->async_write_error = 0; 1582 c->async_write_error = 0;
1551 1583
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index b142946a9e32..c096779a7292 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -108,6 +108,18 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c);
108 */ 108 */
109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); 109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
110 110
111/*
112 * Free the given buffer.
113 * This is just a hint, if the buffer is in use or dirty, this function
114 * does nothing.
115 */
116void dm_bufio_forget(struct dm_bufio_client *c, sector_t block);
117
118/*
119 * Set the minimum number of buffers before cleanup happens.
120 */
121void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n);
122
111unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); 123unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
112sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); 124sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
113sector_t dm_bufio_get_block_number(struct dm_buffer *b); 125sector_t dm_bufio_get_block_number(struct dm_buffer *b);
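
The two new entry points above give a bufio client some control over caching behaviour: dm_bufio_forget() drops a block early when it happens to be clean and unheld, and dm_bufio_set_minimum_buffers() raises the floor below which this client's cache will not be shrunk. A hedged usage sketch (c is a dm_bufio_client created elsewhere; the numbers are illustrative):

	/* keep at least 64 buffers around for this client */
	dm_bufio_set_minimum_buffers(c, 64);

	/* this block will not be re-read soon; free it now if possible
	 * (a no-op when the buffer is dirty or still held) */
	dm_bufio_forget(c, block);
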
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
new file mode 100644
index 000000000000..6c9049c51b2b
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,48 @@
1#include "dm.h"
2
3/*
4 * The kobject release method must not be placed in the module itself,
5 * otherwise we are subject to module unload races.
6 *
7 * The release method is called when the last reference to the kobject is
8 * dropped. It may be called by any other kernel code that drops the last
9 * reference.
10 *
11 * The release method suffers from module unload race. We may prevent the
12 * module from being unloaded at the start of the release method (using
13 * increased module reference count or synchronizing against the release
14 * method), however there is no way to prevent the module from being
15 * unloaded at the end of the release method.
16 *
17 * If this code were placed in the dm module, the following race may
18 * happen:
19 * 1. Some other process takes a reference to dm kobject
20 * 2. The user issues ioctl function to unload the dm device
21 * 3. dm_sysfs_exit calls kobject_put, however the object is not released
22 * because of the other reference taken at step 1
23 * 4. dm_sysfs_exit waits on the completion
24 * 5. The other process that took the reference in step 1 drops it,
25 * dm_kobject_release is called from this process
26 * 6. dm_kobject_release calls complete()
27 * 7. a reschedule happens before dm_kobject_release returns
28 * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference
29 * count is decremented
30 * 9. The user unloads the dm module
31 * 10. The other process that was rescheduled in step 7 continues to run,
32 * it is now executing code in unloaded module, so it crashes
33 *
34 * Note that if the process that takes the foreign reference to dm kobject
35 * has a low priority and the system is sufficiently loaded with
36 * higher-priority processes that prevent the low-priority process from
37 * being scheduled long enough, this bug may really happen.
38 *
39 * In order to fix this module unload race, we place the release method
40 * into a helper code that is compiled directly into the kernel.
41 */
42
43void dm_kobject_release(struct kobject *kobj)
44{
45 complete(dm_get_completion_from_kobject(kobj));
46}
47
48EXPORT_SYMBOL(dm_kobject_release);
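
The built-in release method only closes the race if the module side pairs it with a wait: the completion lives next to the kobject in memory the module still owns, the module drops its reference, and then blocks until dm_kobject_release() has signalled. Roughly (a hedged sketch of that pairing, not the literal dm.h/dm-sysfs.c code; the holder layout is an assumption):

struct dm_kobject_holder {
	struct kobject kobj;
	struct completion completion;	/* completed by dm_kobject_release() */
};

static void sketch_sysfs_exit(struct dm_kobject_holder *holder)
{
	kobject_put(&holder->kobj);
	/* holder is embedded in the mapped device, which outlives this call,
	 * so waiting on the completion after the put is safe */
	wait_for_completion(&holder->completion);
}
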
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 64780ad73bb0..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -72,7 +72,7 @@ static enum io_pattern iot_pattern(struct io_tracker *t)
72 72
73static void iot_update_stats(struct io_tracker *t, struct bio *bio) 73static void iot_update_stats(struct io_tracker *t, struct bio *bio)
74{ 74{
75 if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) 75 if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
76 t->nr_seq_samples++; 76 t->nr_seq_samples++;
77 else { 77 else {
78 /* 78 /*
@@ -87,7 +87,7 @@ static void iot_update_stats(struct io_tracker *t, struct bio *bio)
87 t->nr_rand_samples++; 87 t->nr_rand_samples++;
88 } 88 }
89 89
90 t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); 90 t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
91} 91}
92 92
93static void iot_check_for_pattern_switch(struct io_tracker *t) 93static void iot_check_for_pattern_switch(struct io_tracker *t)
@@ -287,9 +287,8 @@ static struct entry *alloc_entry(struct entry_pool *ep)
287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) 287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
288{ 288{
289 struct entry *e = ep->entries + from_cblock(cblock); 289 struct entry *e = ep->entries + from_cblock(cblock);
290 list_del(&e->list);
291 290
292 INIT_LIST_HEAD(&e->list); 291 list_del_init(&e->list);
293 INIT_HLIST_NODE(&e->hlist); 292 INIT_HLIST_NODE(&e->hlist);
294 ep->nr_allocated++; 293 ep->nr_allocated++;
295 294
@@ -391,6 +390,10 @@ struct mq_policy {
391 */ 390 */
392 unsigned promote_threshold; 391 unsigned promote_threshold;
393 392
393 unsigned discard_promote_adjustment;
394 unsigned read_promote_adjustment;
395 unsigned write_promote_adjustment;
396
394 /* 397 /*
395 * The hash table allows us to quickly find an entry by origin 398 * The hash table allows us to quickly find an entry by origin
396 * block. Both pre_cache and cache entries are in here. 399 * block. Both pre_cache and cache entries are in here.
@@ -400,6 +403,10 @@ struct mq_policy {
400 struct hlist_head *table; 403 struct hlist_head *table;
401}; 404};
402 405
406#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
407#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
408#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
409
403/*----------------------------------------------------------------*/ 410/*----------------------------------------------------------------*/
404 411
405/* 412/*
@@ -642,25 +649,21 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
642 * We bias towards reads, since they can be demoted at no cost if they 649 * We bias towards reads, since they can be demoted at no cost if they
643 * haven't been dirtied. 650 * haven't been dirtied.
644 */ 651 */
645#define DISCARDED_PROMOTE_THRESHOLD 1
646#define READ_PROMOTE_THRESHOLD 4
647#define WRITE_PROMOTE_THRESHOLD 8
648
649static unsigned adjusted_promote_threshold(struct mq_policy *mq, 652static unsigned adjusted_promote_threshold(struct mq_policy *mq,
650 bool discarded_oblock, int data_dir) 653 bool discarded_oblock, int data_dir)
651{ 654{
652 if (data_dir == READ) 655 if (data_dir == READ)
653 return mq->promote_threshold + READ_PROMOTE_THRESHOLD; 656 return mq->promote_threshold + mq->read_promote_adjustment;
654 657
655 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { 658 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
656 /* 659 /*
657 * We don't need to do any copying at all, so give this a 660 * We don't need to do any copying at all, so give this a
658 * very low threshold. 661 * very low threshold.
659 */ 662 */
660 return DISCARDED_PROMOTE_THRESHOLD; 663 return mq->discard_promote_adjustment;
661 } 664 }
662 665
663 return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; 666 return mq->promote_threshold + mq->write_promote_adjustment;
664} 667}
665 668
666static bool should_promote(struct mq_policy *mq, struct entry *e, 669static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -809,7 +812,7 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
809 bool can_migrate, bool discarded_oblock, 812 bool can_migrate, bool discarded_oblock,
810 int data_dir, struct policy_result *result) 813 int data_dir, struct policy_result *result)
811{ 814{
812 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { 815 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
813 if (can_migrate) 816 if (can_migrate)
814 insert_in_cache(mq, oblock, result); 817 insert_in_cache(mq, oblock, result);
815 else 818 else
@@ -869,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
869{ 872{
870 struct mq_policy *mq = to_mq_policy(p); 873 struct mq_policy *mq = to_mq_policy(p);
871 874
872 kfree(mq->table); 875 vfree(mq->table);
873 epool_exit(&mq->cache_pool); 876 epool_exit(&mq->cache_pool);
874 epool_exit(&mq->pre_cache_pool); 877 epool_exit(&mq->pre_cache_pool);
875 kfree(mq); 878 kfree(mq);
@@ -1135,20 +1138,28 @@ static int mq_set_config_value(struct dm_cache_policy *p,
1135 const char *key, const char *value) 1138 const char *key, const char *value)
1136{ 1139{
1137 struct mq_policy *mq = to_mq_policy(p); 1140 struct mq_policy *mq = to_mq_policy(p);
1138 enum io_pattern pattern;
1139 unsigned long tmp; 1141 unsigned long tmp;
1140 1142
1141 if (!strcasecmp(key, "random_threshold"))
1142 pattern = PATTERN_RANDOM;
1143 else if (!strcasecmp(key, "sequential_threshold"))
1144 pattern = PATTERN_SEQUENTIAL;
1145 else
1146 return -EINVAL;
1147
1148 if (kstrtoul(value, 10, &tmp)) 1143 if (kstrtoul(value, 10, &tmp))
1149 return -EINVAL; 1144 return -EINVAL;
1150 1145
1151 mq->tracker.thresholds[pattern] = tmp; 1146 if (!strcasecmp(key, "random_threshold")) {
1147 mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
1148
1149 } else if (!strcasecmp(key, "sequential_threshold")) {
1150 mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
1151
1152 } else if (!strcasecmp(key, "discard_promote_adjustment"))
1153 mq->discard_promote_adjustment = tmp;
1154
1155 else if (!strcasecmp(key, "read_promote_adjustment"))
1156 mq->read_promote_adjustment = tmp;
1157
1158 else if (!strcasecmp(key, "write_promote_adjustment"))
1159 mq->write_promote_adjustment = tmp;
1160
1161 else
1162 return -EINVAL;
1152 1163
1153 return 0; 1164 return 0;
1154} 1165}
@@ -1158,9 +1169,16 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsign
1158 ssize_t sz = 0; 1169 ssize_t sz = 0;
1159 struct mq_policy *mq = to_mq_policy(p); 1170 struct mq_policy *mq = to_mq_policy(p);
1160 1171
1161 DMEMIT("4 random_threshold %u sequential_threshold %u", 1172 DMEMIT("10 random_threshold %u "
1173 "sequential_threshold %u "
1174 "discard_promote_adjustment %u "
1175 "read_promote_adjustment %u "
1176 "write_promote_adjustment %u",
1162 mq->tracker.thresholds[PATTERN_RANDOM], 1177 mq->tracker.thresholds[PATTERN_RANDOM],
1163 mq->tracker.thresholds[PATTERN_SEQUENTIAL]); 1178 mq->tracker.thresholds[PATTERN_SEQUENTIAL],
1179 mq->discard_promote_adjustment,
1180 mq->read_promote_adjustment,
1181 mq->write_promote_adjustment);
1164 1182
1165 return 0; 1183 return 0;
1166} 1184}
@@ -1213,6 +1231,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1213 mq->hit_count = 0; 1231 mq->hit_count = 0;
1214 mq->generation = 0; 1232 mq->generation = 0;
1215 mq->promote_threshold = 0; 1233 mq->promote_threshold = 0;
1234 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1235 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1236 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
1216 mutex_init(&mq->lock); 1237 mutex_init(&mq->lock);
1217 spin_lock_init(&mq->tick_lock); 1238 spin_lock_init(&mq->tick_lock);
1218 1239
@@ -1224,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1224 1245
1225 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); 1246 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1226 mq->hash_bits = ffs(mq->nr_buckets) - 1; 1247 mq->hash_bits = ffs(mq->nr_buckets) - 1;
1227 mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); 1248 mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
1228 if (!mq->table) 1249 if (!mq->table)
1229 goto bad_alloc_table; 1250 goto bad_alloc_table;
1230 1251
@@ -1244,7 +1265,7 @@ bad_pre_cache_init:
1244 1265
1245static struct dm_cache_policy_type mq_policy_type = { 1266static struct dm_cache_policy_type mq_policy_type = {
1246 .name = "mq", 1267 .name = "mq",
1247 .version = {1, 1, 0}, 1268 .version = {1, 2, 0},
1248 .hint_size = 4, 1269 .hint_size = 4,
1249 .owner = THIS_MODULE, 1270 .owner = THIS_MODULE,
1250 .create = mq_create 1271 .create = mq_create
@@ -1252,10 +1273,11 @@ static struct dm_cache_policy_type mq_policy_type = {
1252 1273
1253static struct dm_cache_policy_type default_policy_type = { 1274static struct dm_cache_policy_type default_policy_type = {
1254 .name = "default", 1275 .name = "default",
1255 .version = {1, 1, 0}, 1276 .version = {1, 2, 0},
1256 .hint_size = 4, 1277 .hint_size = 4,
1257 .owner = THIS_MODULE, 1278 .owner = THIS_MODULE,
1258 .create = mq_create 1279 .create = mq_create,
1280 .real = &mq_policy_type
1259}; 1281};
1260 1282
1261static int __init mq_init(void) 1283static int __init mq_init(void)
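The new discard/read/write promote adjustments are consumed by adjusted_promote_threshold(), whose relaxed "<= 1" comparison opens this section. That function's body is not part of this diff, so the following is only a sketch of how such adjustments are typically folded into the base promote_threshold; the exact upstream logic may differ:

static unsigned adjusted_promote_threshold(struct mq_policy *mq,
					   bool discarded_oblock, int data_dir)
{
	if (data_dir == READ)
		return mq->promote_threshold + mq->read_promote_adjustment;

	/* A discarded block needs no copy back, so promote it cheaply. */
	if (discarded_oblock)
		return mq->discard_promote_adjustment;

	return mq->promote_threshold + mq->write_promote_adjustment;
}

Under this scheme, lowering read_promote_adjustment lets reads reach the cache after fewer hits, while raising write_promote_adjustment biases the cache away from write traffic.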
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index d80057968407..c1a3cee99b44 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -146,6 +146,10 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
146{ 146{
147 struct dm_cache_policy_type *t = p->private; 147 struct dm_cache_policy_type *t = p->private;
148 148
149 /* if t->real is set then an alias was used (e.g. "default") */
150 if (t->real)
151 return t->real->name;
152
149 return t->name; 153 return t->name;
150} 154}
151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); 155EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 052c00a84a5c..f50fe360c546 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -223,6 +223,12 @@ struct dm_cache_policy_type {
223 unsigned version[CACHE_POLICY_VERSION_SIZE]; 223 unsigned version[CACHE_POLICY_VERSION_SIZE];
224 224
225 /* 225 /*
226 * For use by an alias dm_cache_policy_type to point to the
227 * real dm_cache_policy_type.
228 */
229 struct dm_cache_policy_type *real;
230
231 /*
226 * Policies may store a hint for each cache block. 232 * Policies may store a hint for each cache block.
227 * Currently the size of this hint must be 0 or 4 bytes but we 233 * Currently the size of this hint must be 0 or 4 bytes but we
228 * expect to relax this in future. 234 * expect to relax this in future.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1b1469ebe5cb..074b9c8e4cf0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -85,6 +85,12 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
85{ 85{
86 bio->bi_end_io = h->bi_end_io; 86 bio->bi_end_io = h->bi_end_io;
87 bio->bi_private = h->bi_private; 87 bio->bi_private = h->bi_private;
88
89 /*
90 * Must bump bi_remaining to allow bio to complete with
91 * restored bi_end_io.
92 */
93 atomic_inc(&bio->bi_remaining);
88} 94}
89 95
90/*----------------------------------------------------------------*/ 96/*----------------------------------------------------------------*/
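dm_unhook_bio() undoes a temporary substitution of the bio's completion routine, and the bi_remaining bump is needed because bio_endio() has already consumed one completion reference when the hooked end_io ran. The hook side is not shown in this hunk; a sketch of what it is assumed to look like:

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	/* Save the current completion so dm_unhook_bio() can restore it. */
	h->bi_end_io = bio->bi_end_io;
	h->bi_private = bio->bi_private;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}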
@@ -283,6 +289,7 @@ struct per_bio_data {
283 bool tick:1; 289 bool tick:1;
284 unsigned req_nr:2; 290 unsigned req_nr:2;
285 struct dm_deferred_entry *all_io_entry; 291 struct dm_deferred_entry *all_io_entry;
292 struct dm_hook_info hook_info;
286 293
287 /* 294 /*
288 * writethrough fields. These MUST remain at the end of this 295 * writethrough fields. These MUST remain at the end of this
@@ -291,7 +298,6 @@ struct per_bio_data {
291 */ 298 */
292 struct cache *cache; 299 struct cache *cache;
293 dm_cblock_t cblock; 300 dm_cblock_t cblock;
294 struct dm_hook_info hook_info;
295 struct dm_bio_details bio_details; 301 struct dm_bio_details bio_details;
296}; 302};
297 303
@@ -664,15 +670,18 @@ static void remap_to_origin(struct cache *cache, struct bio *bio)
664static void remap_to_cache(struct cache *cache, struct bio *bio, 670static void remap_to_cache(struct cache *cache, struct bio *bio,
665 dm_cblock_t cblock) 671 dm_cblock_t cblock)
666{ 672{
667 sector_t bi_sector = bio->bi_sector; 673 sector_t bi_sector = bio->bi_iter.bi_sector;
674 sector_t block = from_cblock(cblock);
668 675
669 bio->bi_bdev = cache->cache_dev->bdev; 676 bio->bi_bdev = cache->cache_dev->bdev;
670 if (!block_size_is_power_of_two(cache)) 677 if (!block_size_is_power_of_two(cache))
671 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + 678 bio->bi_iter.bi_sector =
672 sector_div(bi_sector, cache->sectors_per_block); 679 (block * cache->sectors_per_block) +
680 sector_div(bi_sector, cache->sectors_per_block);
673 else 681 else
674 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | 682 bio->bi_iter.bi_sector =
675 (bi_sector & (cache->sectors_per_block - 1)); 683 (block << cache->sectors_per_block_shift) |
684 (bi_sector & (cache->sectors_per_block - 1));
676} 685}
677 686
678static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 687static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
@@ -712,7 +721,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
712 721
713static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 722static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
714{ 723{
715 sector_t block_nr = bio->bi_sector; 724 sector_t block_nr = bio->bi_iter.bi_sector;
716 725
717 if (!block_size_is_power_of_two(cache)) 726 if (!block_size_is_power_of_two(cache))
718 (void) sector_div(block_nr, cache->sectors_per_block); 727 (void) sector_div(block_nr, cache->sectors_per_block);
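Both remap_to_cache() and get_bio_block() now read the sector from bio->bi_iter; the power-of-two fast path is plain shift-and-mask arithmetic. A worked example with made-up values, runnable as ordinary userspace C:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectors_per_block = 128;	/* 64KiB blocks */
	uint64_t shift = 7;			/* ffs(128) - 1 */
	uint64_t bi_sector = 1037;		/* incoming bio sector */
	uint64_t cblock = 5;			/* cache block chosen by the policy */

	uint64_t oblock = bi_sector >> shift;	/* origin block, as in get_bio_block() */
	uint64_t cache_sector = (cblock << shift) |
				(bi_sector & (sectors_per_block - 1));

	assert(oblock == 8);		/* sectors 1024..1151 fall in block 8 */
	assert(cache_sector == 653);	/* 5 * 128 + (1037 - 1024) */
	return 0;
}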
@@ -970,12 +979,13 @@ static void issue_copy_real(struct dm_cache_migration *mg)
970 int r; 979 int r;
971 struct dm_io_region o_region, c_region; 980 struct dm_io_region o_region, c_region;
972 struct cache *cache = mg->cache; 981 struct cache *cache = mg->cache;
982 sector_t cblock = from_cblock(mg->cblock);
973 983
974 o_region.bdev = cache->origin_dev->bdev; 984 o_region.bdev = cache->origin_dev->bdev;
975 o_region.count = cache->sectors_per_block; 985 o_region.count = cache->sectors_per_block;
976 986
977 c_region.bdev = cache->cache_dev->bdev; 987 c_region.bdev = cache->cache_dev->bdev;
978 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; 988 c_region.sector = cblock * cache->sectors_per_block;
979 c_region.count = cache->sectors_per_block; 989 c_region.count = cache->sectors_per_block;
980 990
981 if (mg->writeback || mg->demote) { 991 if (mg->writeback || mg->demote) {
@@ -1002,13 +1012,15 @@ static void overwrite_endio(struct bio *bio, int err)
1002 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1012 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1003 unsigned long flags; 1013 unsigned long flags;
1004 1014
1015 dm_unhook_bio(&pb->hook_info, bio);
1016
1005 if (err) 1017 if (err)
1006 mg->err = true; 1018 mg->err = true;
1007 1019
1020 mg->requeue_holder = false;
1021
1008 spin_lock_irqsave(&cache->lock, flags); 1022 spin_lock_irqsave(&cache->lock, flags);
1009 list_add_tail(&mg->list, &cache->completed_migrations); 1023 list_add_tail(&mg->list, &cache->completed_migrations);
1010 dm_unhook_bio(&pb->hook_info, bio);
1011 mg->requeue_holder = false;
1012 spin_unlock_irqrestore(&cache->lock, flags); 1024 spin_unlock_irqrestore(&cache->lock, flags);
1013 1025
1014 wake_worker(cache); 1026 wake_worker(cache);
@@ -1027,7 +1039,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1027static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1039static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1028{ 1040{
1029 return (bio_data_dir(bio) == WRITE) && 1041 return (bio_data_dir(bio) == WRITE) &&
1030 (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1042 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1031} 1043}
1032 1044
1033static void avoid_copy(struct dm_cache_migration *mg) 1045static void avoid_copy(struct dm_cache_migration *mg)
@@ -1252,7 +1264,7 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1252 size_t pb_data_size = get_per_bio_data_size(cache); 1264 size_t pb_data_size = get_per_bio_data_size(cache);
1253 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1265 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1254 1266
1255 BUG_ON(bio->bi_size); 1267 BUG_ON(bio->bi_iter.bi_size);
1256 if (!pb->req_nr) 1268 if (!pb->req_nr)
1257 remap_to_origin(cache, bio); 1269 remap_to_origin(cache, bio);
1258 else 1270 else
@@ -1275,9 +1287,9 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1275 */ 1287 */
1276static void process_discard_bio(struct cache *cache, struct bio *bio) 1288static void process_discard_bio(struct cache *cache, struct bio *bio)
1277{ 1289{
1278 dm_block_t start_block = dm_sector_div_up(bio->bi_sector, 1290 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
1279 cache->discard_block_size); 1291 cache->discard_block_size);
1280 dm_block_t end_block = bio->bi_sector + bio_sectors(bio); 1292 dm_block_t end_block = bio_end_sector(bio);
1281 dm_block_t b; 1293 dm_block_t b;
1282 1294
1283 end_block = block_div(end_block, cache->discard_block_size); 1295 end_block = block_div(end_block, cache->discard_block_size);
@@ -2453,20 +2465,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2453 bool discarded_block; 2465 bool discarded_block;
2454 struct dm_bio_prison_cell *cell; 2466 struct dm_bio_prison_cell *cell;
2455 struct policy_result lookup_result; 2467 struct policy_result lookup_result;
2456 struct per_bio_data *pb; 2468 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2457 2469
2458 if (from_oblock(block) > from_oblock(cache->origin_blocks)) { 2470 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2459 /* 2471 /*
2460 * This can only occur if the io goes to a partial block at 2472 * This can only occur if the io goes to a partial block at
2461 * the end of the origin device. We don't cache these. 2473 * the end of the origin device. We don't cache these.
2462 * Just remap to the origin and carry on. 2474 * Just remap to the origin and carry on.
2463 */ 2475 */
2464 remap_to_origin_clear_discard(cache, bio, block); 2476 remap_to_origin(cache, bio);
2465 return DM_MAPIO_REMAPPED; 2477 return DM_MAPIO_REMAPPED;
2466 } 2478 }
2467 2479
2468 pb = init_per_bio_data(bio, pb_data_size);
2469
2470 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { 2480 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2471 defer_bio(cache, bio); 2481 defer_bio(cache, bio);
2472 return DM_MAPIO_SUBMITTED; 2482 return DM_MAPIO_SUBMITTED;
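The bounds test above also changes from ">" to ">=". A tiny illustration with made-up sizes shows why the old test let a bio aimed at the trailing partial block fall through to the caching path:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t origin_blocks = 100;	/* whole blocks; valid indices are 0..99 */
	uint64_t block = 100;		/* bio landing in the partial tail block */

	assert(!(block >  origin_blocks));	/* old test: not caught, bio was cached */
	assert(  block >= origin_blocks);	/* new test: caught, remapped to origin */
	return 0;
}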
@@ -2826,12 +2836,13 @@ static void cache_resume(struct dm_target *ti)
2826/* 2836/*
2827 * Status format: 2837 * Status format:
2828 * 2838 *
2829 * <#used metadata blocks>/<#total metadata blocks> 2839 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
2840 * <cache block size> <#used cache blocks>/<#total cache blocks>
2830 * <#read hits> <#read misses> <#write hits> <#write misses> 2841 * <#read hits> <#read misses> <#write hits> <#write misses>
2831 * <#demotions> <#promotions> <#blocks in cache> <#dirty> 2842 * <#demotions> <#promotions> <#dirty>
2832 * <#features> <features>* 2843 * <#features> <features>*
2833 * <#core args> <core args> 2844 * <#core args> <core args>
2834 * <#policy args> <policy args>* 2845 * <policy name> <#policy args> <policy args>*
2835 */ 2846 */
2836static void cache_status(struct dm_target *ti, status_type_t type, 2847static void cache_status(struct dm_target *ti, status_type_t type,
2837 unsigned status_flags, char *result, unsigned maxlen) 2848 unsigned status_flags, char *result, unsigned maxlen)
@@ -2869,17 +2880,20 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2869 2880
2870 residency = policy_residency(cache->policy); 2881 residency = policy_residency(cache->policy);
2871 2882
2872 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", 2883 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
2884 (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
2873 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2885 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2874 (unsigned long long)nr_blocks_metadata, 2886 (unsigned long long)nr_blocks_metadata,
2887 cache->sectors_per_block,
2888 (unsigned long long) from_cblock(residency),
2889 (unsigned long long) from_cblock(cache->cache_size),
2875 (unsigned) atomic_read(&cache->stats.read_hit), 2890 (unsigned) atomic_read(&cache->stats.read_hit),
2876 (unsigned) atomic_read(&cache->stats.read_miss), 2891 (unsigned) atomic_read(&cache->stats.read_miss),
2877 (unsigned) atomic_read(&cache->stats.write_hit), 2892 (unsigned) atomic_read(&cache->stats.write_hit),
2878 (unsigned) atomic_read(&cache->stats.write_miss), 2893 (unsigned) atomic_read(&cache->stats.write_miss),
2879 (unsigned) atomic_read(&cache->stats.demotion), 2894 (unsigned) atomic_read(&cache->stats.demotion),
2880 (unsigned) atomic_read(&cache->stats.promotion), 2895 (unsigned) atomic_read(&cache->stats.promotion),
2881 (unsigned long long) from_cblock(residency), 2896 (unsigned long long) from_cblock(cache->nr_dirty));
2882 cache->nr_dirty);
2883 2897
2884 if (writethrough_mode(&cache->features)) 2898 if (writethrough_mode(&cache->features))
2885 DMEMIT("1 writethrough "); 2899 DMEMIT("1 writethrough ");
@@ -2896,6 +2910,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2896 } 2910 }
2897 2911
2898 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 2912 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2913
2914 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
2899 if (sz < maxlen) { 2915 if (sz < maxlen) {
2900 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); 2916 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2901 if (r) 2917 if (r)
@@ -3129,7 +3145,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3129 3145
3130static struct target_type cache_target = { 3146static struct target_type cache_target = {
3131 .name = "cache", 3147 .name = "cache",
3132 .version = {1, 2, 0}, 3148 .version = {1, 3, 0},
3133 .module = THIS_MODULE, 3149 .module = THIS_MODULE,
3134 .ctr = cache_ctr, 3150 .ctr = cache_ctr,
3135 .dtr = cache_dtr, 3151 .dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 81b0fa660452..784695d22fde 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -39,10 +39,8 @@ struct convert_context {
39 struct completion restart; 39 struct completion restart;
40 struct bio *bio_in; 40 struct bio *bio_in;
41 struct bio *bio_out; 41 struct bio *bio_out;
42 unsigned int offset_in; 42 struct bvec_iter iter_in;
43 unsigned int offset_out; 43 struct bvec_iter iter_out;
44 unsigned int idx_in;
45 unsigned int idx_out;
46 sector_t cc_sector; 44 sector_t cc_sector;
47 atomic_t cc_pending; 45 atomic_t cc_pending;
48}; 46};
@@ -826,10 +824,10 @@ static void crypt_convert_init(struct crypt_config *cc,
826{ 824{
827 ctx->bio_in = bio_in; 825 ctx->bio_in = bio_in;
828 ctx->bio_out = bio_out; 826 ctx->bio_out = bio_out;
829 ctx->offset_in = 0; 827 if (bio_in)
830 ctx->offset_out = 0; 828 ctx->iter_in = bio_in->bi_iter;
831 ctx->idx_in = bio_in ? bio_in->bi_idx : 0; 829 if (bio_out)
832 ctx->idx_out = bio_out ? bio_out->bi_idx : 0; 830 ctx->iter_out = bio_out->bi_iter;
833 ctx->cc_sector = sector + cc->iv_offset; 831 ctx->cc_sector = sector + cc->iv_offset;
834 init_completion(&ctx->restart); 832 init_completion(&ctx->restart);
835} 833}
@@ -857,8 +855,8 @@ static int crypt_convert_block(struct crypt_config *cc,
857 struct convert_context *ctx, 855 struct convert_context *ctx,
858 struct ablkcipher_request *req) 856 struct ablkcipher_request *req)
859{ 857{
860 struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); 858 struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
861 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); 859 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
862 struct dm_crypt_request *dmreq; 860 struct dm_crypt_request *dmreq;
863 u8 *iv; 861 u8 *iv;
864 int r; 862 int r;
@@ -869,24 +867,15 @@ static int crypt_convert_block(struct crypt_config *cc,
869 dmreq->iv_sector = ctx->cc_sector; 867 dmreq->iv_sector = ctx->cc_sector;
870 dmreq->ctx = ctx; 868 dmreq->ctx = ctx;
871 sg_init_table(&dmreq->sg_in, 1); 869 sg_init_table(&dmreq->sg_in, 1);
872 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 870 sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
873 bv_in->bv_offset + ctx->offset_in); 871 bv_in.bv_offset);
874 872
875 sg_init_table(&dmreq->sg_out, 1); 873 sg_init_table(&dmreq->sg_out, 1);
876 sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, 874 sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
877 bv_out->bv_offset + ctx->offset_out); 875 bv_out.bv_offset);
878 876
879 ctx->offset_in += 1 << SECTOR_SHIFT; 877 bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
880 if (ctx->offset_in >= bv_in->bv_len) { 878 bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
881 ctx->offset_in = 0;
882 ctx->idx_in++;
883 }
884
885 ctx->offset_out += 1 << SECTOR_SHIFT;
886 if (ctx->offset_out >= bv_out->bv_len) {
887 ctx->offset_out = 0;
888 ctx->idx_out++;
889 }
890 879
891 if (cc->iv_gen_ops) { 880 if (cc->iv_gen_ops) {
892 r = cc->iv_gen_ops->generator(cc, iv, dmreq); 881 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
@@ -937,8 +926,7 @@ static int crypt_convert(struct crypt_config *cc,
937 926
938 atomic_set(&ctx->cc_pending, 1); 927 atomic_set(&ctx->cc_pending, 1);
939 928
940 while(ctx->idx_in < ctx->bio_in->bi_vcnt && 929 while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
941 ctx->idx_out < ctx->bio_out->bi_vcnt) {
942 930
943 crypt_alloc_req(cc, ctx); 931 crypt_alloc_req(cc, ctx);
944 932
@@ -1021,7 +1009,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
1021 size -= len; 1009 size -= len;
1022 } 1010 }
1023 1011
1024 if (!clone->bi_size) { 1012 if (!clone->bi_iter.bi_size) {
1025 bio_put(clone); 1013 bio_put(clone);
1026 return NULL; 1014 return NULL;
1027 } 1015 }
@@ -1161,7 +1149,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1161 crypt_inc_pending(io); 1149 crypt_inc_pending(io);
1162 1150
1163 clone_init(io, clone); 1151 clone_init(io, clone);
1164 clone->bi_sector = cc->start + io->sector; 1152 clone->bi_iter.bi_sector = cc->start + io->sector;
1165 1153
1166 generic_make_request(clone); 1154 generic_make_request(clone);
1167 return 0; 1155 return 0;
@@ -1207,9 +1195,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1207 } 1195 }
1208 1196
1209 /* crypt_convert should have filled the clone bio */ 1197 /* crypt_convert should have filled the clone bio */
1210 BUG_ON(io->ctx.idx_out < clone->bi_vcnt); 1198 BUG_ON(io->ctx.iter_out.bi_size);
1211 1199
1212 clone->bi_sector = cc->start + io->sector; 1200 clone->bi_iter.bi_sector = cc->start + io->sector;
1213 1201
1214 if (async) 1202 if (async)
1215 kcryptd_queue_io(io); 1203 kcryptd_queue_io(io);
@@ -1224,7 +1212,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1224 struct dm_crypt_io *new_io; 1212 struct dm_crypt_io *new_io;
1225 int crypt_finished; 1213 int crypt_finished;
1226 unsigned out_of_pages = 0; 1214 unsigned out_of_pages = 0;
1227 unsigned remaining = io->base_bio->bi_size; 1215 unsigned remaining = io->base_bio->bi_iter.bi_size;
1228 sector_t sector = io->sector; 1216 sector_t sector = io->sector;
1229 int r; 1217 int r;
1230 1218
@@ -1246,9 +1234,9 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1246 } 1234 }
1247 1235
1248 io->ctx.bio_out = clone; 1236 io->ctx.bio_out = clone;
1249 io->ctx.idx_out = 0; 1237 io->ctx.iter_out = clone->bi_iter;
1250 1238
1251 remaining -= clone->bi_size; 1239 remaining -= clone->bi_iter.bi_size;
1252 sector += bio_sectors(clone); 1240 sector += bio_sectors(clone);
1253 1241
1254 crypt_inc_pending(io); 1242 crypt_inc_pending(io);
@@ -1290,8 +1278,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1290 crypt_inc_pending(new_io); 1278 crypt_inc_pending(new_io);
1291 crypt_convert_init(cc, &new_io->ctx, NULL, 1279 crypt_convert_init(cc, &new_io->ctx, NULL,
1292 io->base_bio, sector); 1280 io->base_bio, sector);
1293 new_io->ctx.idx_in = io->ctx.idx_in; 1281 new_io->ctx.iter_in = io->ctx.iter_in;
1294 new_io->ctx.offset_in = io->ctx.offset_in;
1295 1282
1296 /* 1283 /*
1297 * Fragments after the first use the base_io 1284 * Fragments after the first use the base_io
@@ -1869,11 +1856,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
1869 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { 1856 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
1870 bio->bi_bdev = cc->dev->bdev; 1857 bio->bi_bdev = cc->dev->bdev;
1871 if (bio_sectors(bio)) 1858 if (bio_sectors(bio))
1872 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); 1859 bio->bi_iter.bi_sector = cc->start +
1860 dm_target_offset(ti, bio->bi_iter.bi_sector);
1873 return DM_MAPIO_REMAPPED; 1861 return DM_MAPIO_REMAPPED;
1874 } 1862 }
1875 1863
1876 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); 1864 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
1877 1865
1878 if (bio_data_dir(io->base_bio) == READ) { 1866 if (bio_data_dir(io->base_bio) == READ) {
1879 if (kcryptd_io_read(io, GFP_NOWAIT)) 1867 if (kcryptd_io_read(io, GFP_NOWAIT))
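crypt_convert_block() now advances a struct bvec_iter through both bios one 512-byte sector at a time instead of maintaining idx/offset pairs by hand. The iteration pattern in isolation (kernel context assumed; the helper and callback names here are hypothetical, only bio_iter_iovec() and bio_advance_iter() are the block-layer primitives used above):

static void for_each_512b_segment(struct bio *bio,
				  void (*fn)(struct page *page, unsigned offset))
{
	struct bvec_iter iter = bio->bi_iter;	/* private copy; the bio itself is untouched */

	while (iter.bi_size) {
		struct bio_vec bv = bio_iter_iovec(bio, iter);

		fn(bv.bv_page, bv.bv_offset);	/* exactly one sector's worth */
		bio_advance_iter(bio, &iter, 1 << SECTOR_SHIFT);
	}
}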
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2f91d6d4a2cc..42c3a27a14cc 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -24,7 +24,6 @@ struct delay_c {
24 struct work_struct flush_expired_bios; 24 struct work_struct flush_expired_bios;
25 struct list_head delayed_bios; 25 struct list_head delayed_bios;
26 atomic_t may_delay; 26 atomic_t may_delay;
27 mempool_t *delayed_pool;
28 27
29 struct dm_dev *dev_read; 28 struct dm_dev *dev_read;
30 sector_t start_read; 29 sector_t start_read;
@@ -40,14 +39,11 @@ struct delay_c {
40struct dm_delay_info { 39struct dm_delay_info {
41 struct delay_c *context; 40 struct delay_c *context;
42 struct list_head list; 41 struct list_head list;
43 struct bio *bio;
44 unsigned long expires; 42 unsigned long expires;
45}; 43};
46 44
47static DEFINE_MUTEX(delayed_bios_lock); 45static DEFINE_MUTEX(delayed_bios_lock);
48 46
49static struct kmem_cache *delayed_cache;
50
51static void handle_delayed_timer(unsigned long data) 47static void handle_delayed_timer(unsigned long data)
52{ 48{
53 struct delay_c *dc = (struct delay_c *)data; 49 struct delay_c *dc = (struct delay_c *)data;
@@ -87,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
87 mutex_lock(&delayed_bios_lock); 83 mutex_lock(&delayed_bios_lock);
88 list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { 84 list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
89 if (flush_all || time_after_eq(jiffies, delayed->expires)) { 85 if (flush_all || time_after_eq(jiffies, delayed->expires)) {
86 struct bio *bio = dm_bio_from_per_bio_data(delayed,
87 sizeof(struct dm_delay_info));
90 list_del(&delayed->list); 88 list_del(&delayed->list);
91 bio_list_add(&flush_bios, delayed->bio); 89 bio_list_add(&flush_bios, bio);
92 if ((bio_data_dir(delayed->bio) == WRITE)) 90 if ((bio_data_dir(bio) == WRITE))
93 delayed->context->writes--; 91 delayed->context->writes--;
94 else 92 else
95 delayed->context->reads--; 93 delayed->context->reads--;
96 mempool_free(delayed, dc->delayed_pool);
97 continue; 94 continue;
98 } 95 }
99 96
@@ -185,12 +182,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
185 } 182 }
186 183
187out: 184out:
188 dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
189 if (!dc->delayed_pool) {
190 DMERR("Couldn't create delayed bio pool.");
191 goto bad_dev_write;
192 }
193
194 dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); 185 dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
195 if (!dc->kdelayd_wq) { 186 if (!dc->kdelayd_wq) {
196 DMERR("Couldn't start kdelayd"); 187 DMERR("Couldn't start kdelayd");
@@ -206,12 +197,11 @@ out:
206 197
207 ti->num_flush_bios = 1; 198 ti->num_flush_bios = 1;
208 ti->num_discard_bios = 1; 199 ti->num_discard_bios = 1;
200 ti->per_bio_data_size = sizeof(struct dm_delay_info);
209 ti->private = dc; 201 ti->private = dc;
210 return 0; 202 return 0;
211 203
212bad_queue: 204bad_queue:
213 mempool_destroy(dc->delayed_pool);
214bad_dev_write:
215 if (dc->dev_write) 205 if (dc->dev_write)
216 dm_put_device(ti, dc->dev_write); 206 dm_put_device(ti, dc->dev_write);
217bad_dev_read: 207bad_dev_read:
@@ -232,7 +222,6 @@ static void delay_dtr(struct dm_target *ti)
232 if (dc->dev_write) 222 if (dc->dev_write)
233 dm_put_device(ti, dc->dev_write); 223 dm_put_device(ti, dc->dev_write);
234 224
235 mempool_destroy(dc->delayed_pool);
236 kfree(dc); 225 kfree(dc);
237} 226}
238 227
@@ -244,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
244 if (!delay || !atomic_read(&dc->may_delay)) 233 if (!delay || !atomic_read(&dc->may_delay))
245 return 1; 234 return 1;
246 235
247 delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); 236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
248 237
249 delayed->context = dc; 238 delayed->context = dc;
250 delayed->bio = bio;
251 delayed->expires = expires = jiffies + (delay * HZ / 1000); 239 delayed->expires = expires = jiffies + (delay * HZ / 1000);
252 240
253 mutex_lock(&delayed_bios_lock); 241 mutex_lock(&delayed_bios_lock);
@@ -289,14 +277,15 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
289 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { 277 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
290 bio->bi_bdev = dc->dev_write->bdev; 278 bio->bi_bdev = dc->dev_write->bdev;
291 if (bio_sectors(bio)) 279 if (bio_sectors(bio))
292 bio->bi_sector = dc->start_write + 280 bio->bi_iter.bi_sector = dc->start_write +
293 dm_target_offset(ti, bio->bi_sector); 281 dm_target_offset(ti, bio->bi_iter.bi_sector);
294 282
295 return delay_bio(dc, dc->write_delay, bio); 283 return delay_bio(dc, dc->write_delay, bio);
296 } 284 }
297 285
298 bio->bi_bdev = dc->dev_read->bdev; 286 bio->bi_bdev = dc->dev_read->bdev;
299 bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); 287 bio->bi_iter.bi_sector = dc->start_read +
288 dm_target_offset(ti, bio->bi_iter.bi_sector);
300 289
301 return delay_bio(dc, dc->read_delay, bio); 290 return delay_bio(dc, dc->read_delay, bio);
302} 291}
@@ -356,13 +345,7 @@ static struct target_type delay_target = {
356 345
357static int __init dm_delay_init(void) 346static int __init dm_delay_init(void)
358{ 347{
359 int r = -ENOMEM; 348 int r;
360
361 delayed_cache = KMEM_CACHE(dm_delay_info, 0);
362 if (!delayed_cache) {
363 DMERR("Couldn't create delayed bio cache.");
364 goto bad_memcache;
365 }
366 349
367 r = dm_register_target(&delay_target); 350 r = dm_register_target(&delay_target);
368 if (r < 0) { 351 if (r < 0) {
@@ -373,15 +356,12 @@ static int __init dm_delay_init(void)
373 return 0; 356 return 0;
374 357
375bad_register: 358bad_register:
376 kmem_cache_destroy(delayed_cache);
377bad_memcache:
378 return r; 359 return r;
379} 360}
380 361
381static void __exit dm_delay_exit(void) 362static void __exit dm_delay_exit(void)
382{ 363{
383 dm_unregister_target(&delay_target); 364 dm_unregister_target(&delay_target);
384 kmem_cache_destroy(delayed_cache);
385} 365}
386 366
387/* Module hooks */ 367/* Module hooks */
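dm-delay drops its private slab and mempool in favour of per-bio data reserved by the device-mapper core. The essentials of that pattern (kernel context assumed; the example_* names are made up, while per_bio_data_size and the dm_per_bio_data()/dm_bio_from_per_bio_data() helpers are the real device-mapper interfaces used above):

struct example_info {
	unsigned long expires;
};

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/* Ask the core to reserve room alongside every bio it hands us. */
	ti->per_bio_data_size = sizeof(struct example_info);
	return 0;
}

static int example_map(struct dm_target *ti, struct bio *bio)
{
	struct example_info *info = dm_per_bio_data(bio, sizeof(struct example_info));

	info->expires = jiffies + HZ;

	/* Later the bio can be recovered from the stored record, as
	 * flush_delayed_bios() does above:
	 * struct bio *b = dm_bio_from_per_bio_data(info, sizeof(struct example_info));
	 */
	return DM_MAPIO_REMAPPED;
}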
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index c80a0ec5f126..b257e46876d3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -248,7 +248,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
248 248
249 bio->bi_bdev = fc->dev->bdev; 249 bio->bi_bdev = fc->dev->bdev;
250 if (bio_sectors(bio)) 250 if (bio_sectors(bio))
251 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); 251 bio->bi_iter.bi_sector =
252 flakey_map_sector(ti, bio->bi_iter.bi_sector);
252} 253}
253 254
254static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) 255static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
@@ -265,8 +266,8 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
265 DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " 266 DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
266 "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", 267 "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
267 bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, 268 bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
268 (bio_data_dir(bio) == WRITE) ? 'w' : 'r', 269 (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw,
269 bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); 270 (unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
270 } 271 }
271} 272}
272 273
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2a20986a2fec..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,26 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
201/* 201/*
202 * Functions for getting the pages from a bvec. 202 * Functions for getting the pages from a bvec.
203 */ 203 */
204static void bvec_get_page(struct dpages *dp, 204static void bio_get_page(struct dpages *dp, struct page **p,
205 struct page **p, unsigned long *len, unsigned *offset) 205 unsigned long *len, unsigned *offset)
206{ 206{
207 struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; 207 struct bio_vec *bvec = dp->context_ptr;
208 *p = bvec->bv_page; 208 *p = bvec->bv_page;
209 *len = bvec->bv_len; 209 *len = bvec->bv_len - dp->context_u;
210 *offset = bvec->bv_offset; 210 *offset = bvec->bv_offset + dp->context_u;
211} 211}
212 212
213static void bvec_next_page(struct dpages *dp) 213static void bio_next_page(struct dpages *dp)
214{ 214{
215 struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; 215 struct bio_vec *bvec = dp->context_ptr;
216 dp->context_ptr = bvec + 1; 216 dp->context_ptr = bvec + 1;
217 dp->context_u = 0;
217} 218}
218 219
219static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) 220static void bio_dp_init(struct dpages *dp, struct bio *bio)
220{ 221{
221 dp->get_page = bvec_get_page; 222 dp->get_page = bio_get_page;
222 dp->next_page = bvec_next_page; 223 dp->next_page = bio_next_page;
223 dp->context_ptr = bvec; 224 dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
225 dp->context_u = bio->bi_iter.bi_bvec_done;
224} 226}
225 227
226/* 228/*
@@ -304,14 +306,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
304 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); 306 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
305 307
306 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 308 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
307 bio->bi_sector = where->sector + (where->count - remaining); 309 bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
308 bio->bi_bdev = where->bdev; 310 bio->bi_bdev = where->bdev;
309 bio->bi_end_io = endio; 311 bio->bi_end_io = endio;
310 store_io_and_region_in_bio(bio, io, region); 312 store_io_and_region_in_bio(bio, io, region);
311 313
312 if (rw & REQ_DISCARD) { 314 if (rw & REQ_DISCARD) {
313 num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); 315 num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
314 bio->bi_size = num_sectors << SECTOR_SHIFT; 316 bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
315 remaining -= num_sectors; 317 remaining -= num_sectors;
316 } else if (rw & REQ_WRITE_SAME) { 318 } else if (rw & REQ_WRITE_SAME) {
317 /* 319 /*
@@ -320,7 +322,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
320 dp->get_page(dp, &page, &len, &offset); 322 dp->get_page(dp, &page, &len, &offset);
321 bio_add_page(bio, page, logical_block_size, offset); 323 bio_add_page(bio, page, logical_block_size, offset);
322 num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); 324 num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
323 bio->bi_size = num_sectors << SECTOR_SHIFT; 325 bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
324 326
325 offset = 0; 327 offset = 0;
326 remaining -= num_sectors; 328 remaining -= num_sectors;
@@ -457,8 +459,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
457 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); 459 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
458 break; 460 break;
459 461
460 case DM_IO_BVEC: 462 case DM_IO_BIO:
461 bvec_dp_init(dp, io_req->mem.ptr.bvec); 463 bio_dp_init(dp, io_req->mem.ptr.bio);
462 break; 464 break;
463 465
464 case DM_IO_VMA: 466 case DM_IO_VMA:
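The new DM_IO_BIO memory type hands dm-io a bio positioned at its current iterator, including a partially consumed first bvec via bi_bvec_done. The dm-raid1 hunks later in this patch use it exactly this way; a condensed sketch (the wrapper function is hypothetical, the dm_io_request fields and dm_io() call are real):

static void issue_bio_read(struct dm_io_client *client,
			   struct dm_io_region *where,
			   struct bio *bio, io_notify_fn callback)
{
	struct dm_io_request io_req = {
		.bi_rw = READ,
		.mem.type = DM_IO_BIO,		/* walk the bio from bio->bi_iter onwards */
		.mem.ptr.bio = bio,
		.notify.fn = callback,
		.notify.context = bio,
		.client = client,
	};

	BUG_ON(dm_io(&io_req, 1, where, NULL));
}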
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4f99d267340c..53e848c10939 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -85,7 +85,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
85 85
86 bio->bi_bdev = lc->dev->bdev; 86 bio->bi_bdev = lc->dev->bdev;
87 if (bio_sectors(bio)) 87 if (bio_sectors(bio))
88 bio->bi_sector = linear_map_sector(ti, bio->bi_sector); 88 bio->bi_iter.bi_sector =
89 linear_map_sector(ti, bio->bi_iter.bi_sector);
89} 90}
90 91
91static int linear_map(struct dm_target *ti, struct bio *bio) 92static int linear_map(struct dm_target *ti, struct bio *bio)
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 9429159d9ee3..b953db6cc229 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -10,10 +10,11 @@
10#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/workqueue.h>
13 14
14#include "dm-log-userspace-transfer.h" 15#include "dm-log-userspace-transfer.h"
15 16
16#define DM_LOG_USERSPACE_VSN "1.1.0" 17#define DM_LOG_USERSPACE_VSN "1.3.0"
17 18
18struct flush_entry { 19struct flush_entry {
19 int type; 20 int type;
@@ -58,6 +59,18 @@ struct log_c {
58 spinlock_t flush_lock; 59 spinlock_t flush_lock;
59 struct list_head mark_list; 60 struct list_head mark_list;
60 struct list_head clear_list; 61 struct list_head clear_list;
62
63 /*
64 * Workqueue for flush of clear region requests.
65 */
66 struct workqueue_struct *dmlog_wq;
67 struct delayed_work flush_log_work;
68 atomic_t sched_flush;
69
70 /*
71 * Combine userspace flush and mark requests for efficiency.
72 */
73 uint32_t integrated_flush;
61}; 74};
62 75
63static mempool_t *flush_entry_pool; 76static mempool_t *flush_entry_pool;
@@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti,
122 135
123 *ctr_str = NULL; 136 *ctr_str = NULL;
124 137
138 /*
139 * Determine overall size of the string.
140 */
125 for (i = 0, str_size = 0; i < argc; i++) 141 for (i = 0, str_size = 0; i < argc; i++)
126 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 142 str_size += strlen(argv[i]) + 1; /* +1 for space between args */
127 143
@@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti,
141 return str_size; 157 return str_size;
142} 158}
143 159
160static void do_flush(struct work_struct *work)
161{
162 int r;
163 struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);
164
165 atomic_set(&lc->sched_flush, 0);
166
167 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL);
168
169 if (r)
170 dm_table_event(lc->ti->table);
171}
172
144/* 173/*
145 * userspace_ctr 174 * userspace_ctr
146 * 175 *
147 * argv contains: 176 * argv contains:
148 * <UUID> <other args> 177 * <UUID> [integrated_flush] <other args>
149 * Where 'other args' is the userspace implementation specific log 178 * Where 'other args' are the userspace implementation-specific log
150 * arguments. An example might be: 179 * arguments.
151 * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] 180 *
181 * Example:
182 * <UUID> [integrated_flush] clustered-disk <arg count> <log dev>
183 * <region_size> [[no]sync]
184 *
185 * This module strips off the <UUID> and uses it for identification
186 * purposes when communicating with userspace about a log.
152 * 187 *
153 * So, this module will strip off the <UUID> for identification purposes 188 * If integrated_flush is defined, the kernel combines flush
154 * when communicating with userspace about a log; but will pass on everything 189 * and mark requests.
155 * else. 190 *
191 * The rest of the line, beginning with 'clustered-disk', is passed
192 * to the userspace ctr function.
156 */ 193 */
157static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 194static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
158 unsigned argc, char **argv) 195 unsigned argc, char **argv)
@@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
188 return -EINVAL; 225 return -EINVAL;
189 } 226 }
190 227
228 lc->usr_argc = argc;
229
191 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 230 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
231 argc--;
232 argv++;
192 spin_lock_init(&lc->flush_lock); 233 spin_lock_init(&lc->flush_lock);
193 INIT_LIST_HEAD(&lc->mark_list); 234 INIT_LIST_HEAD(&lc->mark_list);
194 INIT_LIST_HEAD(&lc->clear_list); 235 INIT_LIST_HEAD(&lc->clear_list);
195 236
196 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 237 if (!strcasecmp(argv[0], "integrated_flush")) {
238 lc->integrated_flush = 1;
239 argc--;
240 argv++;
241 }
242
243 str_size = build_constructor_string(ti, argc, argv, &ctr_str);
197 if (str_size < 0) { 244 if (str_size < 0) {
198 kfree(lc); 245 kfree(lc);
199 return str_size; 246 return str_size;
@@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
246 DMERR("Failed to register %s with device-mapper", 293 DMERR("Failed to register %s with device-mapper",
247 devices_rdata); 294 devices_rdata);
248 } 295 }
296
297 if (lc->integrated_flush) {
298 lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0);
299 if (!lc->dmlog_wq) {
300 DMERR("couldn't start dmlogd");
301 r = -ENOMEM;
302 goto out;
303 }
304
305 INIT_DELAYED_WORK(&lc->flush_log_work, do_flush);
306 atomic_set(&lc->sched_flush, 0);
307 }
308
249out: 309out:
250 kfree(devices_rdata); 310 kfree(devices_rdata);
251 if (r) { 311 if (r) {
@@ -253,7 +313,6 @@ out:
253 kfree(ctr_str); 313 kfree(ctr_str);
254 } else { 314 } else {
255 lc->usr_argv_str = ctr_str; 315 lc->usr_argv_str = ctr_str;
256 lc->usr_argc = argc;
257 log->context = lc; 316 log->context = lc;
258 } 317 }
259 318
@@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log)
264{ 323{
265 struct log_c *lc = log->context; 324 struct log_c *lc = log->context;
266 325
326 if (lc->integrated_flush) {
327 /* flush workqueue */
328 if (atomic_read(&lc->sched_flush))
329 flush_delayed_work(&lc->flush_log_work);
330
331 destroy_workqueue(lc->dmlog_wq);
332 }
333
267 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 334 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
268 NULL, 0, 335 NULL, 0, NULL, NULL);
269 NULL, NULL);
270 336
271 if (lc->log_dev) 337 if (lc->log_dev)
272 dm_put_device(lc->ti, lc->log_dev); 338 dm_put_device(lc->ti, lc->log_dev);
@@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
283 struct log_c *lc = log->context; 349 struct log_c *lc = log->context;
284 350
285 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 351 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
286 NULL, 0, 352 NULL, 0, NULL, NULL);
287 NULL, NULL);
288 353
289 return r; 354 return r;
290} 355}
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
294 int r; 359 int r;
295 struct log_c *lc = log->context; 360 struct log_c *lc = log->context;
296 361
362 /*
363 * Run planned flush earlier.
364 */
365 if (lc->integrated_flush && atomic_read(&lc->sched_flush))
366 flush_delayed_work(&lc->flush_log_work);
367
297 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 368 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
298 NULL, 0, 369 NULL, 0, NULL, NULL);
299 NULL, NULL);
300 370
301 return r; 371 return r;
302} 372}
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log)
308 378
309 lc->in_sync_hint = 0; 379 lc->in_sync_hint = 0;
310 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 380 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
311 NULL, 0, 381 NULL, 0, NULL, NULL);
312 NULL, NULL);
313 382
314 return r; 383 return r;
315} 384}
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
405 return r; 474 return r;
406} 475}
407 476
408static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 477static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
478 int flush_with_payload)
409{ 479{
410 int r = 0; 480 int r = 0;
411 int count; 481 int count;
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
431 break; 501 break;
432 } 502 }
433 503
434 r = userspace_do_request(lc, lc->uuid, type, 504 if (flush_with_payload) {
435 (char *)(group), 505 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
436 count * sizeof(uint64_t), 506 (char *)(group),
437 NULL, NULL); 507 count * sizeof(uint64_t),
438 if (r) { 508 NULL, NULL);
439 /* Group send failed. Attempt one-by-one. */ 509 /*
440 list_splice_init(&tmp_list, flush_list); 510 * Integrated flush failed.
441 r = flush_one_by_one(lc, flush_list); 511 */
442 break; 512 if (r)
513 break;
514 } else {
515 r = userspace_do_request(lc, lc->uuid, type,
516 (char *)(group),
517 count * sizeof(uint64_t),
518 NULL, NULL);
519 if (r) {
520 /*
521 * Group send failed. Attempt one-by-one.
522 */
523 list_splice_init(&tmp_list, flush_list);
524 r = flush_one_by_one(lc, flush_list);
525 break;
526 }
443 } 527 }
444 } 528 }
445 529
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log)
476 struct log_c *lc = log->context; 560 struct log_c *lc = log->context;
477 LIST_HEAD(mark_list); 561 LIST_HEAD(mark_list);
478 LIST_HEAD(clear_list); 562 LIST_HEAD(clear_list);
563 int mark_list_is_empty;
564 int clear_list_is_empty;
479 struct flush_entry *fe, *tmp_fe; 565 struct flush_entry *fe, *tmp_fe;
480 566
481 spin_lock_irqsave(&lc->flush_lock, flags); 567 spin_lock_irqsave(&lc->flush_lock, flags);
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log)
483 list_splice_init(&lc->clear_list, &clear_list); 569 list_splice_init(&lc->clear_list, &clear_list);
484 spin_unlock_irqrestore(&lc->flush_lock, flags); 570 spin_unlock_irqrestore(&lc->flush_lock, flags);
485 571
486 if (list_empty(&mark_list) && list_empty(&clear_list)) 572 mark_list_is_empty = list_empty(&mark_list);
573 clear_list_is_empty = list_empty(&clear_list);
574
575 if (mark_list_is_empty && clear_list_is_empty)
487 return 0; 576 return 0;
488 577
489 r = flush_by_group(lc, &mark_list); 578 r = flush_by_group(lc, &clear_list, 0);
490 if (r) 579 if (r)
491 goto fail; 580 goto out;
492 581
493 r = flush_by_group(lc, &clear_list); 582 if (!lc->integrated_flush) {
583 r = flush_by_group(lc, &mark_list, 0);
584 if (r)
585 goto out;
586 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
587 NULL, 0, NULL, NULL);
588 goto out;
589 }
590
591 /*
592 * Send integrated flush request with mark_list as payload.
593 */
594 r = flush_by_group(lc, &mark_list, 1);
494 if (r) 595 if (r)
495 goto fail; 596 goto out;
496 597
497 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 598 if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
498 NULL, 0, NULL, NULL); 599 /*
600 * When there are only clear region requests,
601 * we schedule a flush in the future.
602 */
603 queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
604 atomic_set(&lc->sched_flush, 1);
605 } else {
606 /*
607 * Cancel pending flush because we
608 * have already flushed in mark_region.
609 */
610 cancel_delayed_work(&lc->flush_log_work);
611 atomic_set(&lc->sched_flush, 0);
612 }
499 613
500fail: 614out:
501 /* 615 /*
502 * We can safely remove these entries, even if failure. 616 * We can safely remove these entries, even after failure.
503 * Calling code will receive an error and will know that 617 * Calling code will receive an error and will know that
504 * the log facility has failed. 618 * the log facility has failed.
505 */ 619 */
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
603 717
604 rdata_size = sizeof(pkg); 718 rdata_size = sizeof(pkg);
605 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 719 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
606 NULL, 0, 720 NULL, 0, (char *)&pkg, &rdata_size);
607 (char *)&pkg, &rdata_size);
608 721
609 *region = pkg.r; 722 *region = pkg.r;
610 return (r) ? r : (int)pkg.i; 723 return (r) ? r : (int)pkg.i;
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
630 pkg.i = (int64_t)in_sync; 743 pkg.i = (int64_t)in_sync;
631 744
632 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 745 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
633 (char *)&pkg, sizeof(pkg), 746 (char *)&pkg, sizeof(pkg), NULL, NULL);
634 NULL, NULL);
635 747
636 /* 748 /*
637 * It would be nice to be able to report failures. 749 * It would be nice to be able to report failures.
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log)
657 769
658 rdata_size = sizeof(sync_count); 770 rdata_size = sizeof(sync_count);
659 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 771 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
660 NULL, 0, 772 NULL, 0, (char *)&sync_count, &rdata_size);
661 (char *)&sync_count, &rdata_size);
662 773
663 if (r) 774 if (r)
664 return 0; 775 return 0;
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
685 switch (status_type) { 796 switch (status_type) {
686 case STATUSTYPE_INFO: 797 case STATUSTYPE_INFO:
687 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 798 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
688 NULL, 0, 799 NULL, 0, result, &sz);
689 result, &sz);
690 800
691 if (r) { 801 if (r) {
692 sz = 0; 802 sz = 0;
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
699 BUG_ON(!table_args); /* There will always be a ' ' */ 809 BUG_ON(!table_args); /* There will always be a ' ' */
700 table_args++; 810 table_args++;
701 811
702 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 812 DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
703 lc->uuid, table_args); 813 if (lc->integrated_flush)
814 DMEMIT("integrated_flush ");
815 DMEMIT("%s ", table_args);
704 break; 816 break;
705 } 817 }
706 return (r) ? 0 : (int)sz; 818 return (r) ? 0 : (int)sz;
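With integrated_flush, clear-only batches no longer trigger an immediate userspace round trip; the flush is parked on a delayed work item and cancelled again once a mark forces a real flush. The scheduling idiom on its own (kernel context assumed; only the workqueue and atomic calls are real, the example_log structure mirrors the fields added above):

struct example_log {
	struct workqueue_struct *wq;
	struct delayed_work flush_work;
	atomic_t sched_flush;
};

static void example_defer_flush(struct example_log *lc)
{
	if (!atomic_read(&lc->sched_flush)) {
		/* Only clear-region requests pending: flush a little later. */
		queue_delayed_work(lc->wq, &lc->flush_work, 3 * HZ);
		atomic_set(&lc->sched_flush, 1);
	}
}

static void example_flushed_now(struct example_log *lc)
{
	/* A real flush just went to userspace; drop the deferred one. */
	cancel_delayed_work(&lc->flush_work);
	atomic_set(&lc->sched_flush, 0);
}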
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1626 /* 1626 /*
1627 * Only pass ioctls through if the device sizes match exactly. 1627 * Only pass ioctls through if the device sizes match exactly.
1628 */ 1628 */
1629 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) 1629 if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
1630 r = scsi_verify_blk_ioctl(NULL, cmd); 1630 int err = scsi_verify_blk_ioctl(NULL, cmd);
1631 if (err)
1632 r = err;
1633 }
1631 1634
1632 if (r == -ENOTCONN && !fatal_signal_pending(current)) 1635 if (r == -ENOTCONN && !fatal_signal_pending(current))
1633 queue_work(kmultipathd, &m->process_queued_ios); 1636 queue_work(kmultipathd, &m->process_queued_ios);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9584443c5614..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -432,7 +432,7 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
432 region_t region = dm_rh_bio_to_region(ms->rh, bio); 432 region_t region = dm_rh_bio_to_region(ms->rh, bio);
433 433
434 if (log->type->in_sync(log, region, 0)) 434 if (log->type->in_sync(log, region, 0))
435 return choose_mirror(ms, bio->bi_sector) ? 1 : 0; 435 return choose_mirror(ms, bio->bi_iter.bi_sector) ? 1 : 0;
436 436
437 return 0; 437 return 0;
438} 438}
@@ -442,15 +442,15 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
442 */ 442 */
443static sector_t map_sector(struct mirror *m, struct bio *bio) 443static sector_t map_sector(struct mirror *m, struct bio *bio)
444{ 444{
445 if (unlikely(!bio->bi_size)) 445 if (unlikely(!bio->bi_iter.bi_size))
446 return 0; 446 return 0;
447 return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); 447 return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector);
448} 448}
449 449
450static void map_bio(struct mirror *m, struct bio *bio) 450static void map_bio(struct mirror *m, struct bio *bio)
451{ 451{
452 bio->bi_bdev = m->dev->bdev; 452 bio->bi_bdev = m->dev->bdev;
453 bio->bi_sector = map_sector(m, bio); 453 bio->bi_iter.bi_sector = map_sector(m, bio);
454} 454}
455 455
456static void map_region(struct dm_io_region *io, struct mirror *m, 456static void map_region(struct dm_io_region *io, struct mirror *m,
@@ -526,8 +526,8 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
526 struct dm_io_region io; 526 struct dm_io_region io;
527 struct dm_io_request io_req = { 527 struct dm_io_request io_req = {
528 .bi_rw = READ, 528 .bi_rw = READ,
529 .mem.type = DM_IO_BVEC, 529 .mem.type = DM_IO_BIO,
530 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 530 .mem.ptr.bio = bio,
531 .notify.fn = read_callback, 531 .notify.fn = read_callback,
532 .notify.context = bio, 532 .notify.context = bio,
533 .client = m->ms->io_client, 533 .client = m->ms->io_client,
@@ -559,7 +559,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
559 * We can only read balance if the region is in sync. 559 * We can only read balance if the region is in sync.
560 */ 560 */
561 if (likely(region_in_sync(ms, region, 1))) 561 if (likely(region_in_sync(ms, region, 1)))
562 m = choose_mirror(ms, bio->bi_sector); 562 m = choose_mirror(ms, bio->bi_iter.bi_sector);
563 else if (m && atomic_read(&m->error_count)) 563 else if (m && atomic_read(&m->error_count))
564 m = NULL; 564 m = NULL;
565 565
@@ -629,8 +629,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
629 struct mirror *m; 629 struct mirror *m;
630 struct dm_io_request io_req = { 630 struct dm_io_request io_req = {
631 .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), 631 .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
632 .mem.type = DM_IO_BVEC, 632 .mem.type = DM_IO_BIO,
633 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 633 .mem.ptr.bio = bio,
634 .notify.fn = write_callback, 634 .notify.fn = write_callback,
635 .notify.context = bio, 635 .notify.context = bio,
636 .client = ms->io_client, 636 .client = ms->io_client,
@@ -1181,7 +1181,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
1181 * The region is in-sync and we can perform reads directly. 1181 * The region is in-sync and we can perform reads directly.
1182 * Store enough information so we can retry if it fails. 1182 * Store enough information so we can retry if it fails.
1183 */ 1183 */
1184 m = choose_mirror(ms, bio->bi_sector); 1184 m = choose_mirror(ms, bio->bi_iter.bi_sector);
1185 if (unlikely(!m)) 1185 if (unlikely(!m))
1186 return -EIO; 1186 return -EIO;
1187 1187
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1244 1244
1245 dm_bio_restore(bd, bio); 1245 dm_bio_restore(bd, bio);
1246 bio_record->details.bi_bdev = NULL; 1246 bio_record->details.bi_bdev = NULL;
1247
1248 atomic_inc(&bio->bi_remaining);
1249
1247 queue_bio(ms, bio, rw); 1250 queue_bio(ms, bio, rw);
1248 return DM_ENDIO_INCOMPLETE; 1251 return DM_ENDIO_INCOMPLETE;
1249 } 1252 }
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 69732e03eb34..b929fd5f4984 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -126,7 +126,8 @@ EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
126 126
127region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) 127region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
128{ 128{
129 return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); 129 return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector -
130 rh->target_begin);
130} 131}
131EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); 132EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
132 133
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2d2b1b7588d7..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -13,10 +13,13 @@
13#include <linux/export.h> 13#include <linux/export.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/dm-io.h> 15#include <linux/dm-io.h>
16#include "dm-bufio.h"
16 17
17#define DM_MSG_PREFIX "persistent snapshot" 18#define DM_MSG_PREFIX "persistent snapshot"
18#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ 19#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
19 20
21#define DM_PREFETCH_CHUNKS 12
22
20/*----------------------------------------------------------------- 23/*-----------------------------------------------------------------
21 * Persistent snapshots, by persistent we mean that the snapshot 24 * Persistent snapshots, by persistent we mean that the snapshot
22 * will survive a reboot. 25 * will survive a reboot.
@@ -257,6 +260,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 260 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 261 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 262 flush_workqueue(ps->metadata_wq);
263 destroy_work_on_stack(&req.work);
260 264
261 return req.result; 265 return req.result;
262} 266}
@@ -401,17 +405,18 @@ static int write_header(struct pstore *ps)
401/* 405/*
402 * Access functions for the disk exceptions, these do the endian conversions. 406 * Access functions for the disk exceptions, these do the endian conversions.
403 */ 407 */
404static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) 408static struct disk_exception *get_exception(struct pstore *ps, void *ps_area,
409 uint32_t index)
405{ 410{
406 BUG_ON(index >= ps->exceptions_per_area); 411 BUG_ON(index >= ps->exceptions_per_area);
407 412
408 return ((struct disk_exception *) ps->area) + index; 413 return ((struct disk_exception *) ps_area) + index;
409} 414}
410 415
411static void read_exception(struct pstore *ps, 416static void read_exception(struct pstore *ps, void *ps_area,
412 uint32_t index, struct core_exception *result) 417 uint32_t index, struct core_exception *result)
413{ 418{
414 struct disk_exception *de = get_exception(ps, index); 419 struct disk_exception *de = get_exception(ps, ps_area, index);
415 420
416 /* copy it */ 421 /* copy it */
417 result->old_chunk = le64_to_cpu(de->old_chunk); 422 result->old_chunk = le64_to_cpu(de->old_chunk);
@@ -421,7 +426,7 @@ static void read_exception(struct pstore *ps,
421static void write_exception(struct pstore *ps, 426static void write_exception(struct pstore *ps,
422 uint32_t index, struct core_exception *e) 427 uint32_t index, struct core_exception *e)
423{ 428{
424 struct disk_exception *de = get_exception(ps, index); 429 struct disk_exception *de = get_exception(ps, ps->area, index);
425 430
426 /* copy it */ 431 /* copy it */
427 de->old_chunk = cpu_to_le64(e->old_chunk); 432 de->old_chunk = cpu_to_le64(e->old_chunk);
@@ -430,7 +435,7 @@ static void write_exception(struct pstore *ps,
430 435
431static void clear_exception(struct pstore *ps, uint32_t index) 436static void clear_exception(struct pstore *ps, uint32_t index)
432{ 437{
433 struct disk_exception *de = get_exception(ps, index); 438 struct disk_exception *de = get_exception(ps, ps->area, index);
434 439
435 /* clear it */ 440 /* clear it */
436 de->old_chunk = 0; 441 de->old_chunk = 0;
@@ -442,7 +447,7 @@ static void clear_exception(struct pstore *ps, uint32_t index)
442 * 'full' is filled in to indicate if the area has been 447 * 'full' is filled in to indicate if the area has been
443 * filled. 448 * filled.
444 */ 449 */
445static int insert_exceptions(struct pstore *ps, 450static int insert_exceptions(struct pstore *ps, void *ps_area,
446 int (*callback)(void *callback_context, 451 int (*callback)(void *callback_context,
447 chunk_t old, chunk_t new), 452 chunk_t old, chunk_t new),
448 void *callback_context, 453 void *callback_context,
@@ -456,7 +461,7 @@ static int insert_exceptions(struct pstore *ps,
456 *full = 1; 461 *full = 1;
457 462
458 for (i = 0; i < ps->exceptions_per_area; i++) { 463 for (i = 0; i < ps->exceptions_per_area; i++) {
459 read_exception(ps, i, &e); 464 read_exception(ps, ps_area, i, &e);
460 465
461 /* 466 /*
462 * If the new_chunk is pointing at the start of 467 * If the new_chunk is pointing at the start of
@@ -493,26 +498,75 @@ static int read_exceptions(struct pstore *ps,
493 void *callback_context) 498 void *callback_context)
494{ 499{
495 int r, full = 1; 500 int r, full = 1;
501 struct dm_bufio_client *client;
502 chunk_t prefetch_area = 0;
503
504 client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
505 ps->store->chunk_size << SECTOR_SHIFT,
506 1, 0, NULL, NULL);
507
508 if (IS_ERR(client))
509 return PTR_ERR(client);
510
511 /*
512 * Setup for one current buffer + desired readahead buffers.
513 */
514 dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS);
496 515
497 /* 516 /*
498 * Keeping reading chunks and inserting exceptions until 517 * Keeping reading chunks and inserting exceptions until
499 * we find a partially full area. 518 * we find a partially full area.
500 */ 519 */
501 for (ps->current_area = 0; full; ps->current_area++) { 520 for (ps->current_area = 0; full; ps->current_area++) {
502 r = area_io(ps, READ); 521 struct dm_buffer *bp;
503 if (r) 522 void *area;
504 return r; 523 chunk_t chunk;
524
525 if (unlikely(prefetch_area < ps->current_area))
526 prefetch_area = ps->current_area;
527
528 if (DM_PREFETCH_CHUNKS) do {
529 chunk_t pf_chunk = area_location(ps, prefetch_area);
530 if (unlikely(pf_chunk >= dm_bufio_get_device_size(client)))
531 break;
532 dm_bufio_prefetch(client, pf_chunk, 1);
533 prefetch_area++;
534 if (unlikely(!prefetch_area))
535 break;
536 } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS);
537
538 chunk = area_location(ps, ps->current_area);
539
540 area = dm_bufio_read(client, chunk, &bp);
541 if (unlikely(IS_ERR(area))) {
542 r = PTR_ERR(area);
543 goto ret_destroy_bufio;
544 }
505 545
506 r = insert_exceptions(ps, callback, callback_context, &full); 546 r = insert_exceptions(ps, area, callback, callback_context,
507 if (r) 547 &full);
508 return r; 548
549 if (!full)
550 memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
551
552 dm_bufio_release(bp);
553
554 dm_bufio_forget(client, chunk);
555
556 if (unlikely(r))
557 goto ret_destroy_bufio;
509 } 558 }
510 559
511 ps->current_area--; 560 ps->current_area--;
512 561
513 skip_metadata(ps); 562 skip_metadata(ps);
514 563
515 return 0; 564 r = 0;
565
566ret_destroy_bufio:
567 dm_bufio_client_destroy(client);
568
569 return r;
516} 570}
517 571
518static struct pstore *get_info(struct dm_exception_store *store) 572static struct pstore *get_info(struct dm_exception_store *store)
@@ -733,7 +787,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
733 ps->current_committed = ps->exceptions_per_area; 787 ps->current_committed = ps->exceptions_per_area;
734 } 788 }
735 789
736 read_exception(ps, ps->current_committed - 1, &ce); 790 read_exception(ps, ps->area, ps->current_committed - 1, &ce);
737 *last_old_chunk = ce.old_chunk; 791 *last_old_chunk = ce.old_chunk;
738 *last_new_chunk = ce.new_chunk; 792 *last_new_chunk = ce.new_chunk;
739 793
@@ -743,8 +797,8 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
743 */ 797 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed; 798 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) { 799 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive, 800 read_exception(ps, ps->area,
747 &ce); 801 ps->current_committed - 1 - nr_consecutive, &ce);
748 if (ce.old_chunk != *last_old_chunk - nr_consecutive || 802 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
749 ce.new_chunk != *last_new_chunk - nr_consecutive) 803 ce.new_chunk != *last_new_chunk - nr_consecutive)
750 break; 804 break;
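read_exceptions() above now reads the on-disk exception areas through dm-bufio so that, while one area is being parsed, the next DM_PREFETCH_CHUNKS areas are already in flight. A condensed sketch of that read-with-readahead pattern, using only the dm-bufio calls that appear in the hunk; the helper name and the trimmed error handling are editorial, not the patch's:

    static void *read_area_with_readahead(struct dm_bufio_client *client,
                                          sector_t chunk, unsigned nr_prefetch,
                                          struct dm_buffer **bp)
    {
            sector_t pf;

            /* Kick off asynchronous reads for the areas we expect to need next. */
            for (pf = chunk + 1; pf <= chunk + nr_prefetch; pf++) {
                    if (pf >= dm_bufio_get_device_size(client))
                            break;
                    dm_bufio_prefetch(client, pf, 1);
            }

            /* Synchronous read of the area needed right now. */
            return dm_bufio_read(client, chunk, bp);
    }

The real loop additionally keeps prefetch_area monotonic across iterations, releases and forgets each buffer once parsed, and copies the final, partially filled area back into ps->area so later writes keep operating on the pstore's own buffer.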
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 944690bafd93..ebddef5237e4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -610,12 +610,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
610 return NULL; 610 return NULL;
611} 611}
612 612
613static struct dm_exception *alloc_completed_exception(void) 613static struct dm_exception *alloc_completed_exception(gfp_t gfp)
614{ 614{
615 struct dm_exception *e; 615 struct dm_exception *e;
616 616
617 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 617 e = kmem_cache_alloc(exception_cache, gfp);
618 if (!e) 618 if (!e && gfp == GFP_NOIO)
619 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); 619 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
620 620
621 return e; 621 return e;
@@ -697,7 +697,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
697 struct dm_snapshot *s = context; 697 struct dm_snapshot *s = context;
698 struct dm_exception *e; 698 struct dm_exception *e;
699 699
700 e = alloc_completed_exception(); 700 e = alloc_completed_exception(GFP_KERNEL);
701 if (!e) 701 if (!e)
702 return -ENOMEM; 702 return -ENOMEM;
703 703
@@ -1405,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1405 goto out; 1405 goto out;
1406 } 1406 }
1407 1407
1408 e = alloc_completed_exception(); 1408 e = alloc_completed_exception(GFP_NOIO);
1409 if (!e) { 1409 if (!e) {
1410 down_write(&s->lock); 1410 down_write(&s->lock);
1411 __invalidate_snapshot(s, -ENOMEM); 1411 __invalidate_snapshot(s, -ENOMEM);
@@ -1438,6 +1438,7 @@ out:
1438 if (full_bio) { 1438 if (full_bio) {
1439 full_bio->bi_end_io = pe->full_bio_end_io; 1439 full_bio->bi_end_io = pe->full_bio_end_io;
1440 full_bio->bi_private = pe->full_bio_private; 1440 full_bio->bi_private = pe->full_bio_private;
1441 atomic_inc(&full_bio->bi_remaining);
1441 } 1442 }
1442 free_pending_exception(pe); 1443 free_pending_exception(pe);
1443 1444
@@ -1619,11 +1620,10 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1619 struct bio *bio, chunk_t chunk) 1620 struct bio *bio, chunk_t chunk)
1620{ 1621{
1621 bio->bi_bdev = s->cow->bdev; 1622 bio->bi_bdev = s->cow->bdev;
1622 bio->bi_sector = chunk_to_sector(s->store, 1623 bio->bi_iter.bi_sector =
1623 dm_chunk_number(e->new_chunk) + 1624 chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
1624 (chunk - e->old_chunk)) + 1625 (chunk - e->old_chunk)) +
1625 (bio->bi_sector & 1626 (bio->bi_iter.bi_sector & s->store->chunk_mask);
1626 s->store->chunk_mask);
1627} 1627}
1628 1628
1629static int snapshot_map(struct dm_target *ti, struct bio *bio) 1629static int snapshot_map(struct dm_target *ti, struct bio *bio)
@@ -1641,7 +1641,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1641 return DM_MAPIO_REMAPPED; 1641 return DM_MAPIO_REMAPPED;
1642 } 1642 }
1643 1643
1644 chunk = sector_to_chunk(s->store, bio->bi_sector); 1644 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1645 1645
1646 /* Full snapshots are not usable */ 1646 /* Full snapshots are not usable */
1647 /* To get here the table must be live so s->active is always set. */ 1647 /* To get here the table must be live so s->active is always set. */
@@ -1702,7 +1702,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1702 r = DM_MAPIO_SUBMITTED; 1702 r = DM_MAPIO_SUBMITTED;
1703 1703
1704 if (!pe->started && 1704 if (!pe->started &&
1705 bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { 1705 bio->bi_iter.bi_size ==
1706 (s->store->chunk_size << SECTOR_SHIFT)) {
1706 pe->started = 1; 1707 pe->started = 1;
1707 up_write(&s->lock); 1708 up_write(&s->lock);
1708 start_full_bio(pe, bio); 1709 start_full_bio(pe, bio);
@@ -1758,7 +1759,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1758 return DM_MAPIO_REMAPPED; 1759 return DM_MAPIO_REMAPPED;
1759 } 1760 }
1760 1761
1761 chunk = sector_to_chunk(s->store, bio->bi_sector); 1762 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1762 1763
1763 down_write(&s->lock); 1764 down_write(&s->lock);
1764 1765
@@ -2095,7 +2096,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
2095 down_read(&_origins_lock); 2096 down_read(&_origins_lock);
2096 o = __lookup_origin(origin->bdev); 2097 o = __lookup_origin(origin->bdev);
2097 if (o) 2098 if (o)
2098 r = __origin_write(&o->snapshots, bio->bi_sector, bio); 2099 r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
2099 up_read(&_origins_lock); 2100 up_read(&_origins_lock);
2100 2101
2101 return r; 2102 return r;
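Two behavioural notes on the dm-snap.c hunks above. alloc_completed_exception() now takes its gfp flags from the caller, so the table-load path (dm_add_exception) can use plain GFP_KERNEL while the I/O completion path keeps GFP_NOIO with the old GFP_ATOMIC fallback. And pending_complete() takes an extra bi_remaining reference when handing full_bio back, which the reworked bio completion accounting in this series expects once a bio's end_io has been intercepted. The allocation helper, restated with comments:

    static struct dm_exception *alloc_completed_exception(gfp_t gfp)
    {
            struct dm_exception *e;

            e = kmem_cache_alloc(exception_cache, gfp);
            if (!e && gfp == GFP_NOIO)
                    /* I/O path: try the atomic reserves before giving up,
                     * exactly as the old unconditional code did. */
                    e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

            return e;
    }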
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 73c1712dad96..d1600d2aa2e2 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -259,13 +259,15 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
259{ 259{
260 sector_t begin, end; 260 sector_t begin, end;
261 261
262 stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); 262 stripe_map_range_sector(sc, bio->bi_iter.bi_sector,
263 target_stripe, &begin);
263 stripe_map_range_sector(sc, bio_end_sector(bio), 264 stripe_map_range_sector(sc, bio_end_sector(bio),
264 target_stripe, &end); 265 target_stripe, &end);
265 if (begin < end) { 266 if (begin < end) {
266 bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; 267 bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
267 bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; 268 bio->bi_iter.bi_sector = begin +
268 bio->bi_size = to_bytes(end - begin); 269 sc->stripe[target_stripe].physical_start;
270 bio->bi_iter.bi_size = to_bytes(end - begin);
269 return DM_MAPIO_REMAPPED; 271 return DM_MAPIO_REMAPPED;
270 } else { 272 } else {
271 /* The range doesn't map to the target stripe */ 273 /* The range doesn't map to the target stripe */
@@ -293,9 +295,10 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
293 return stripe_map_range(sc, bio, target_bio_nr); 295 return stripe_map_range(sc, bio, target_bio_nr);
294 } 296 }
295 297
296 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); 298 stripe_map_sector(sc, bio->bi_iter.bi_sector,
299 &stripe, &bio->bi_iter.bi_sector);
297 300
298 bio->bi_sector += sc->stripe[stripe].physical_start; 301 bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start;
299 bio->bi_bdev = sc->stripe[stripe].dev->bdev; 302 bio->bi_bdev = sc->stripe[stripe].dev->bdev;
300 303
301 return DM_MAPIO_REMAPPED; 304 return DM_MAPIO_REMAPPED;
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index ff9ac4be4721..09a688b3d48c 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -311,11 +311,11 @@ error:
311static int switch_map(struct dm_target *ti, struct bio *bio) 311static int switch_map(struct dm_target *ti, struct bio *bio)
312{ 312{
313 struct switch_ctx *sctx = ti->private; 313 struct switch_ctx *sctx = ti->private;
314 sector_t offset = dm_target_offset(ti, bio->bi_sector); 314 sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
315 unsigned path_nr = switch_get_path_nr(sctx, offset); 315 unsigned path_nr = switch_get_path_nr(sctx, offset);
316 316
317 bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; 317 bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
318 bio->bi_sector = sctx->path_list[path_nr].start + offset; 318 bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
319 319
320 return DM_MAPIO_REMAPPED; 320 return DM_MAPIO_REMAPPED;
321} 321}
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 84d2b91e4efb..c62c5ab6aed5 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = {
86static struct kobj_type dm_ktype = { 86static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 87 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 88 .default_attrs = dm_attrs,
89 .release = dm_kobject_release,
89}; 90};
90 91
91/* 92/*
@@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md)
104 */ 105 */
105void dm_sysfs_exit(struct mapped_device *md) 106void dm_sysfs_exit(struct mapped_device *md)
106{ 107{
107 kobject_put(dm_kobject(md)); 108 struct kobject *kobj = dm_kobject(md);
109 kobject_put(kobj);
110 wait_for_completion(dm_get_completion_from_kobject(kobj));
108} 111}
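Giving dm_ktype a release method and making dm_sysfs_exit() wait on a completion closes a teardown race: the mapped_device must not be freed while sysfs still holds a reference to its kobject. A sketch of the handshake, assuming dm_kobject_release() (added elsewhere in this series, outside this excerpt) simply completes the completion returned by dm_get_completion_from_kobject():

    /* Release side: called by the kobject core once the last reference drops. */
    void dm_kobject_release(struct kobject *kobj)
    {
            complete(dm_get_completion_from_kobject(kobj));
    }

    /* Teardown side, as in the hunk above: drop our reference, then block
     * until every remaining sysfs user has gone away. */
    void dm_sysfs_exit(struct mapped_device *md)
    {
            struct kobject *kobj = dm_kobject(md);

            kobject_put(kobj);
            wait_for_completion(dm_get_completion_from_kobject(kobj));
    }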
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ba6a3859ce3..6a7f2b83a126 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -155,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
155{ 155{
156 sector_t *n_highs; 156 sector_t *n_highs;
157 struct dm_target *n_targets; 157 struct dm_target *n_targets;
158 int n = t->num_targets;
159 158
160 /* 159 /*
161 * Allocate both the target array and offset array at once. 160 * Allocate both the target array and offset array at once.
@@ -169,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
169 168
170 n_targets = (struct dm_target *) (n_highs + num); 169 n_targets = (struct dm_target *) (n_highs + num);
171 170
172 if (n) { 171 memset(n_highs, -1, sizeof(*n_highs) * num);
173 memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
174 memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
175 }
176
177 memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
178 vfree(t->highs); 172 vfree(t->highs);
179 173
180 t->num_allocated = num; 174 t->num_allocated = num;
@@ -261,17 +255,6 @@ void dm_table_destroy(struct dm_table *t)
261} 255}
262 256
263/* 257/*
264 * Checks to see if we need to extend highs or targets.
265 */
266static inline int check_space(struct dm_table *t)
267{
268 if (t->num_targets >= t->num_allocated)
269 return alloc_targets(t, t->num_allocated * 2);
270
271 return 0;
272}
273
274/*
275 * See if we've already got a device in the list. 258 * See if we've already got a device in the list.
276 */ 259 */
277static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) 260static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
@@ -731,8 +714,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
731 return -EINVAL; 714 return -EINVAL;
732 } 715 }
733 716
734 if ((r = check_space(t))) 717 BUG_ON(t->num_targets >= t->num_allocated);
735 return r;
736 718
737 tgt = t->targets + t->num_targets; 719 tgt = t->targets + t->num_targets;
738 memset(tgt, 0, sizeof(*tgt)); 720 memset(tgt, 0, sizeof(*tgt));
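The dm-table.c hunks remove the grow-on-demand path for the target array: a table is created with its final target count, so alloc_targets() never has existing entries to copy and check_space() can go. A caller only ever fills a pre-sized slot, which the new BUG_ON asserts. A reminder of where that size comes from, with the dm_table_create() declaration quoted from the device-mapper header (not part of this hunk):

    /* Capacity is fixed at creation time ... */
    int dm_table_create(struct dm_table **result, fmode_t mode,
                        unsigned num_targets, struct mapped_device *md);

    /* ... so dm_table_add_target() can only ever fill an existing slot: */
    BUG_ON(t->num_targets >= t->num_allocated);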
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 8a30ad54bd46..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
76 76
77#define THIN_SUPERBLOCK_MAGIC 27022010 77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0 78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1 79#define THIN_VERSION 2
80#define THIN_METADATA_CACHE_SIZE 64 80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 81#define SECTOR_TO_BLOCK_SHIFT 3
82 82
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
483 483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); 487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); 488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489 489
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
651{ 651{
652 int r; 652 int r;
653 653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, 654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
655 THIN_METADATA_CACHE_SIZE, 655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS); 656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) { 657 if (IS_ERR(pmd->bm)) {
@@ -1349,6 +1349,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1349 return td->id; 1349 return td->id;
1350} 1350}
1351 1351
1352/*
1353 * Check whether @time (of block creation) is older than @td's last snapshot.
1354 * If so then the associated block is shared with the last snapshot device.
1355 * Any block on a device created *after* the device last got snapshotted is
1356 * necessarily not shared.
1357 */
1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1358static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1353{ 1359{
1354 return td->snapshotted_time > time; 1360 return td->snapshotted_time > time;
@@ -1458,6 +1464,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1458 return r; 1464 return r;
1459} 1465}
1460 1466
1467int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1468{
1469 int r;
1470 uint32_t ref_count;
1471
1472 down_read(&pmd->root_lock);
1473 r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1474 if (!r)
1475 *result = (ref_count != 0);
1476 up_read(&pmd->root_lock);
1477
1478 return r;
1479}
1480
1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td) 1481bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1462{ 1482{
1463 int r; 1483 int r;
@@ -1469,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1469 return r; 1489 return r;
1470} 1490}
1471 1491
1492bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1493{
1494 bool r = false;
1495 struct dm_thin_device *td, *tmp;
1496
1497 down_read(&pmd->root_lock);
1498 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1499 if (td->changed) {
1500 r = td->changed;
1501 break;
1502 }
1503 }
1504 up_read(&pmd->root_lock);
1505
1506 return r;
1507}
1508
1472bool dm_thin_aborted_changes(struct dm_thin_device *td) 1509bool dm_thin_aborted_changes(struct dm_thin_device *td)
1473{ 1510{
1474 bool r; 1511 bool r;
@@ -1718,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1718 1755
1719 return r; 1756 return r;
1720} 1757}
1758
1759int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1760{
1761 int r;
1762 struct dm_block *sblock;
1763 struct thin_disk_superblock *disk_super;
1764
1765 down_write(&pmd->root_lock);
1766 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1767
1768 r = superblock_lock(pmd, &sblock);
1769 if (r) {
1770 DMERR("couldn't read superblock");
1771 goto out;
1772 }
1773
1774 disk_super = dm_block_data(sblock);
1775 disk_super->flags = cpu_to_le32(pmd->flags);
1776
1777 dm_bm_unlock(sblock);
1778out:
1779 up_write(&pmd->root_lock);
1780 return r;
1781}
1782
1783bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1784{
1785 bool needs_check;
1786
1787 down_read(&pmd->root_lock);
1788 needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1789 up_read(&pmd->root_lock);
1790
1791 return needs_check;
1792}
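Three additions in dm-thin-metadata.c feed the dm-thin.c rework further down: dm_pool_block_is_used() lets the discard path ask whether a data block still has references before passing a discard through to the data device, dm_pool_changed_this_transaction() lets the worker skip commits when nothing changed, and THIN_METADATA_NEEDS_CHECK_FLAG (with THIN_VERSION bumped to 2) records in the superblock that userspace must check and repair the metadata before the pool may return to write mode. Typical use of the reference-count query, taken from process_prepared_discard_passdown() below:

    bool used = false;

    /* If the lookup fails, err on the safe side and keep the data. */
    if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
            bio_endio(m->bio, 0);                           /* block still referenced: swallow the discard */
    else
            remap_and_issue(tc, m->bio, m->data_block);     /* safe to pass the discard down */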
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 7bcc0e1d6238..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@
9 9
10#include "persistent-data/dm-block-manager.h" 10#include "persistent-data/dm-block-manager.h"
11#include "persistent-data/dm-space-map.h" 11#include "persistent-data/dm-space-map.h"
12#include "persistent-data/dm-space-map-metadata.h"
12 13
13#define THIN_METADATA_BLOCK_SIZE 4096 14#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
14 15
15/* 16/*
16 * The metadata device is currently limited in size. 17 * The metadata device is currently limited in size.
17 *
18 * We have one block of index, which can hold 255 index entries. Each
19 * index entry contains allocation info about 16k metadata blocks.
20 */ 18 */
21#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) 19#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
22 20
23/* 21/*
24 * A metadata device larger than 16GB triggers a warning. 22 * A metadata device larger than 16GB triggers a warning.
@@ -27,6 +25,11 @@
27 25
28/*----------------------------------------------------------------*/ 26/*----------------------------------------------------------------*/
29 27
28/*
29 * Thin metadata superblock flags.
30 */
31#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
32
30struct dm_pool_metadata; 33struct dm_pool_metadata;
31struct dm_thin_device; 34struct dm_thin_device;
32 35
@@ -131,7 +134,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
131 134
132struct dm_thin_lookup_result { 135struct dm_thin_lookup_result {
133 dm_block_t block; 136 dm_block_t block;
134 unsigned shared:1; 137 bool shared:1;
135}; 138};
136 139
137/* 140/*
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
161 */ 164 */
162bool dm_thin_changed_this_transaction(struct dm_thin_device *td); 165bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
163 166
167bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
168
164bool dm_thin_aborted_changes(struct dm_thin_device *td); 169bool dm_thin_aborted_changes(struct dm_thin_device *td);
165 170
166int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 171int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -181,6 +186,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
181 186
182int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); 187int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
183 188
189int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
190
184/* 191/*
185 * Returns -ENOSPC if the new size is too small and already allocated 192 * Returns -ENOSPC if the new size is too small and already allocated
186 * blocks would be lost. 193 * blocks would be lost.
@@ -200,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
200 dm_sm_threshold_fn fn, 207 dm_sm_threshold_fn fn,
201 void *context); 208 void *context);
202 209
210/*
211 * Updates the superblock immediately.
212 */
213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
215
203/*----------------------------------------------------------------*/ 216/*----------------------------------------------------------------*/
204 217
 205#endif 218#endif
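THIN_METADATA_BLOCK_SIZE changes units here: it used to be a byte count and is now expressed in 512-byte sectors via DM_SM_METADATA_BLOCK_SIZE, which is why dm-thin-metadata.c above drops a >> SECTOR_SHIFT when filling the superblock field and adds a << SECTOR_SHIFT when creating the block manager. A quick check of the arithmetic, assuming the 4KiB metadata block size defined in dm-space-map-metadata.h (not shown in this excerpt):

    unsigned md_block_sectors = 8;                      /* DM_SM_METADATA_BLOCK_SIZE, assumed 4096 >> SECTOR_SHIFT */
    unsigned md_block_bytes   = md_block_sectors << 9;  /* 4096 bytes handed to dm_block_manager_create() */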
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index ee29037ffc2e..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
130struct dm_thin_new_mapping; 130struct dm_thin_new_mapping;
131 131
132/* 132/*
133 * The pool runs in 3 modes. Ordered in degraded order for comparisons. 133 * The pool runs in 4 modes. Ordered in degraded order for comparisons.
134 */ 134 */
135enum pool_mode { 135enum pool_mode {
136 PM_WRITE, /* metadata may be changed */ 136 PM_WRITE, /* metadata may be changed */
137 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
137 PM_READ_ONLY, /* metadata may not be changed */ 138 PM_READ_ONLY, /* metadata may not be changed */
138 PM_FAIL, /* all I/O fails */ 139 PM_FAIL, /* all I/O fails */
139}; 140};
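PM_OUT_OF_DATA_SPACE slots in between PM_WRITE and PM_READ_ONLY because, as the comment says, the enum stays ordered from healthiest to most degraded; being out of data space still permits metadata updates, so it must rank as less degraded than read-only. A hypothetical check of the style that ordering allows (illustration only, not a function from the patch):

    static bool pool_is_degraded(struct pool *pool)
    {
            /* Anything past PM_WRITE is some flavour of degraded. */
            return get_pool_mode(pool) > PM_WRITE;
    }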
@@ -144,6 +145,7 @@ struct pool_features {
144 bool zero_new_blocks:1; 145 bool zero_new_blocks:1;
145 bool discard_enabled:1; 146 bool discard_enabled:1;
146 bool discard_passdown:1; 147 bool discard_passdown:1;
148 bool error_if_no_space:1;
147}; 149};
148 150
149struct thin_c; 151struct thin_c;
@@ -163,8 +165,7 @@ struct pool {
163 int sectors_per_block_shift; 165 int sectors_per_block_shift;
164 166
165 struct pool_features pf; 167 struct pool_features pf;
166 unsigned low_water_triggered:1; /* A dm event has been sent */ 168 bool low_water_triggered:1; /* A dm event has been sent */
167 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
168 169
169 struct dm_bio_prison *prison; 170 struct dm_bio_prison *prison;
170 struct dm_kcopyd_client *copier; 171 struct dm_kcopyd_client *copier;
@@ -198,7 +199,7 @@ struct pool {
198}; 199};
199 200
200static enum pool_mode get_pool_mode(struct pool *pool); 201static enum pool_mode get_pool_mode(struct pool *pool);
201static void set_pool_mode(struct pool *pool, enum pool_mode mode); 202static void metadata_operation_failed(struct pool *pool, const char *op, int r);
202 203
203/* 204/*
204 * Target context for a pool. 205 * Target context for a pool.
@@ -225,6 +226,7 @@ struct thin_c {
225 226
226 struct pool *pool; 227 struct pool *pool;
227 struct dm_thin_device *td; 228 struct dm_thin_device *td;
229 bool requeue_mode:1;
228}; 230};
229 231
230/*----------------------------------------------------------------*/ 232/*----------------------------------------------------------------*/
@@ -368,14 +370,18 @@ struct dm_thin_endio_hook {
368 struct dm_thin_new_mapping *overwrite_mapping; 370 struct dm_thin_new_mapping *overwrite_mapping;
369}; 371};
370 372
371static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 373static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
372{ 374{
373 struct bio *bio; 375 struct bio *bio;
374 struct bio_list bios; 376 struct bio_list bios;
377 unsigned long flags;
375 378
376 bio_list_init(&bios); 379 bio_list_init(&bios);
380
381 spin_lock_irqsave(&tc->pool->lock, flags);
377 bio_list_merge(&bios, master); 382 bio_list_merge(&bios, master);
378 bio_list_init(master); 383 bio_list_init(master);
384 spin_unlock_irqrestore(&tc->pool->lock, flags);
379 385
380 while ((bio = bio_list_pop(&bios))) { 386 while ((bio = bio_list_pop(&bios))) {
381 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 387 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -390,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
390static void requeue_io(struct thin_c *tc) 396static void requeue_io(struct thin_c *tc)
391{ 397{
392 struct pool *pool = tc->pool; 398 struct pool *pool = tc->pool;
399
400 requeue_bio_list(tc, &pool->deferred_bios);
401 requeue_bio_list(tc, &pool->retry_on_resume_list);
402}
403
404static void error_retry_list(struct pool *pool)
405{
406 struct bio *bio;
393 unsigned long flags; 407 unsigned long flags;
408 struct bio_list bios;
409
410 bio_list_init(&bios);
394 411
395 spin_lock_irqsave(&pool->lock, flags); 412 spin_lock_irqsave(&pool->lock, flags);
396 __requeue_bio_list(tc, &pool->deferred_bios); 413 bio_list_merge(&bios, &pool->retry_on_resume_list);
397 __requeue_bio_list(tc, &pool->retry_on_resume_list); 414 bio_list_init(&pool->retry_on_resume_list);
398 spin_unlock_irqrestore(&pool->lock, flags); 415 spin_unlock_irqrestore(&pool->lock, flags);
416
417 while ((bio = bio_list_pop(&bios)))
418 bio_io_error(bio);
399} 419}
400 420
401/* 421/*
@@ -413,7 +433,7 @@ static bool block_size_is_power_of_two(struct pool *pool)
413static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 433static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
414{ 434{
415 struct pool *pool = tc->pool; 435 struct pool *pool = tc->pool;
416 sector_t block_nr = bio->bi_sector; 436 sector_t block_nr = bio->bi_iter.bi_sector;
417 437
418 if (block_size_is_power_of_two(pool)) 438 if (block_size_is_power_of_two(pool))
419 block_nr >>= pool->sectors_per_block_shift; 439 block_nr >>= pool->sectors_per_block_shift;
@@ -426,14 +446,15 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
426static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 446static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
427{ 447{
428 struct pool *pool = tc->pool; 448 struct pool *pool = tc->pool;
429 sector_t bi_sector = bio->bi_sector; 449 sector_t bi_sector = bio->bi_iter.bi_sector;
430 450
431 bio->bi_bdev = tc->pool_dev->bdev; 451 bio->bi_bdev = tc->pool_dev->bdev;
432 if (block_size_is_power_of_two(pool)) 452 if (block_size_is_power_of_two(pool))
433 bio->bi_sector = (block << pool->sectors_per_block_shift) | 453 bio->bi_iter.bi_sector =
434 (bi_sector & (pool->sectors_per_block - 1)); 454 (block << pool->sectors_per_block_shift) |
455 (bi_sector & (pool->sectors_per_block - 1));
435 else 456 else
436 bio->bi_sector = (block * pool->sectors_per_block) + 457 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
437 sector_div(bi_sector, pool->sectors_per_block); 458 sector_div(bi_sector, pool->sectors_per_block);
438} 459}
439 460
@@ -509,15 +530,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
509struct dm_thin_new_mapping { 530struct dm_thin_new_mapping {
510 struct list_head list; 531 struct list_head list;
511 532
512 unsigned quiesced:1; 533 bool quiesced:1;
513 unsigned prepared:1; 534 bool prepared:1;
514 unsigned pass_discard:1; 535 bool pass_discard:1;
536 bool definitely_not_shared:1;
515 537
538 int err;
516 struct thin_c *tc; 539 struct thin_c *tc;
517 dm_block_t virt_block; 540 dm_block_t virt_block;
518 dm_block_t data_block; 541 dm_block_t data_block;
519 struct dm_bio_prison_cell *cell, *cell2; 542 struct dm_bio_prison_cell *cell, *cell2;
520 int err;
521 543
522 /* 544 /*
523 * If the bio covers the whole area of a block then we can avoid 545 * If the bio covers the whole area of a block then we can avoid
@@ -534,7 +556,7 @@ static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
534 struct pool *pool = m->tc->pool; 556 struct pool *pool = m->tc->pool;
535 557
536 if (m->quiesced && m->prepared) { 558 if (m->quiesced && m->prepared) {
537 list_add(&m->list, &pool->prepared_mappings); 559 list_add_tail(&m->list, &pool->prepared_mappings);
538 wake_worker(pool); 560 wake_worker(pool);
539 } 561 }
540} 562}
@@ -548,7 +570,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
548 m->err = read_err || write_err ? -EIO : 0; 570 m->err = read_err || write_err ? -EIO : 0;
549 571
550 spin_lock_irqsave(&pool->lock, flags); 572 spin_lock_irqsave(&pool->lock, flags);
551 m->prepared = 1; 573 m->prepared = true;
552 __maybe_add_mapping(m); 574 __maybe_add_mapping(m);
553 spin_unlock_irqrestore(&pool->lock, flags); 575 spin_unlock_irqrestore(&pool->lock, flags);
554} 576}
@@ -563,7 +585,7 @@ static void overwrite_endio(struct bio *bio, int err)
563 m->err = err; 585 m->err = err;
564 586
565 spin_lock_irqsave(&pool->lock, flags); 587 spin_lock_irqsave(&pool->lock, flags);
566 m->prepared = 1; 588 m->prepared = true;
567 __maybe_add_mapping(m); 589 __maybe_add_mapping(m);
568 spin_unlock_irqrestore(&pool->lock, flags); 590 spin_unlock_irqrestore(&pool->lock, flags);
569} 591}
@@ -610,8 +632,10 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
610 632
611static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 633static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
612{ 634{
613 if (m->bio) 635 if (m->bio) {
614 m->bio->bi_end_io = m->saved_bi_end_io; 636 m->bio->bi_end_io = m->saved_bi_end_io;
637 atomic_inc(&m->bio->bi_remaining);
638 }
615 cell_error(m->tc->pool, m->cell); 639 cell_error(m->tc->pool, m->cell);
616 list_del(&m->list); 640 list_del(&m->list);
617 mempool_free(m, m->tc->pool->mapping_pool); 641 mempool_free(m, m->tc->pool->mapping_pool);
@@ -625,8 +649,10 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
625 int r; 649 int r;
626 650
627 bio = m->bio; 651 bio = m->bio;
628 if (bio) 652 if (bio) {
629 bio->bi_end_io = m->saved_bi_end_io; 653 bio->bi_end_io = m->saved_bi_end_io;
654 atomic_inc(&bio->bi_remaining);
655 }
630 656
631 if (m->err) { 657 if (m->err) {
632 cell_error(pool, m->cell); 658 cell_error(pool, m->cell);
@@ -640,9 +666,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
640 */ 666 */
641 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 667 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
642 if (r) { 668 if (r) {
643 DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", 669 metadata_operation_failed(pool, "dm_thin_insert_block", r);
644 dm_device_name(pool->pool_md), r);
645 set_pool_mode(pool, PM_READ_ONLY);
646 cell_error(pool, m->cell); 670 cell_error(pool, m->cell);
647 goto out; 671 goto out;
648 } 672 }
@@ -683,7 +707,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
683 cell_defer_no_holder(tc, m->cell2); 707 cell_defer_no_holder(tc, m->cell2);
684 708
685 if (m->pass_discard) 709 if (m->pass_discard)
686 remap_and_issue(tc, m->bio, m->data_block); 710 if (m->definitely_not_shared)
711 remap_and_issue(tc, m->bio, m->data_block);
712 else {
713 bool used = false;
714 if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
715 bio_endio(m->bio, 0);
716 else
717 remap_and_issue(tc, m->bio, m->data_block);
718 }
687 else 719 else
688 bio_endio(m->bio, 0); 720 bio_endio(m->bio, 0);
689 721
@@ -723,7 +755,8 @@ static void process_prepared(struct pool *pool, struct list_head *head,
723 */ 755 */
724static int io_overlaps_block(struct pool *pool, struct bio *bio) 756static int io_overlaps_block(struct pool *pool, struct bio *bio)
725{ 757{
726 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); 758 return bio->bi_iter.bi_size ==
759 (pool->sectors_per_block << SECTOR_SHIFT);
727} 760}
728 761
729static int io_overwrites_block(struct pool *pool, struct bio *bio) 762static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -751,13 +784,17 @@ static int ensure_next_mapping(struct pool *pool)
751 784
752static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 785static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
753{ 786{
754 struct dm_thin_new_mapping *r = pool->next_mapping; 787 struct dm_thin_new_mapping *m = pool->next_mapping;
755 788
756 BUG_ON(!pool->next_mapping); 789 BUG_ON(!pool->next_mapping);
757 790
791 memset(m, 0, sizeof(struct dm_thin_new_mapping));
792 INIT_LIST_HEAD(&m->list);
793 m->bio = NULL;
794
758 pool->next_mapping = NULL; 795 pool->next_mapping = NULL;
759 796
760 return r; 797 return m;
761} 798}
762 799
763static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 800static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -769,18 +806,13 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
769 struct pool *pool = tc->pool; 806 struct pool *pool = tc->pool;
770 struct dm_thin_new_mapping *m = get_next_mapping(pool); 807 struct dm_thin_new_mapping *m = get_next_mapping(pool);
771 808
772 INIT_LIST_HEAD(&m->list);
773 m->quiesced = 0;
774 m->prepared = 0;
775 m->tc = tc; 809 m->tc = tc;
776 m->virt_block = virt_block; 810 m->virt_block = virt_block;
777 m->data_block = data_dest; 811 m->data_block = data_dest;
778 m->cell = cell; 812 m->cell = cell;
779 m->err = 0;
780 m->bio = NULL;
781 813
782 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) 814 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
783 m->quiesced = 1; 815 m->quiesced = true;
784 816
785 /* 817 /*
786 * IO to pool_dev remaps to the pool target's data_dev. 818 * IO to pool_dev remaps to the pool target's data_dev.
@@ -840,15 +872,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
840 struct pool *pool = tc->pool; 872 struct pool *pool = tc->pool;
841 struct dm_thin_new_mapping *m = get_next_mapping(pool); 873 struct dm_thin_new_mapping *m = get_next_mapping(pool);
842 874
843 INIT_LIST_HEAD(&m->list); 875 m->quiesced = true;
844 m->quiesced = 1; 876 m->prepared = false;
845 m->prepared = 0;
846 m->tc = tc; 877 m->tc = tc;
847 m->virt_block = virt_block; 878 m->virt_block = virt_block;
848 m->data_block = data_block; 879 m->data_block = data_block;
849 m->cell = cell; 880 m->cell = cell;
850 m->err = 0;
851 m->bio = NULL;
852 881
853 /* 882 /*
854 * If the whole block of data is being overwritten or we are not 883 * If the whole block of data is being overwritten or we are not
@@ -895,41 +924,44 @@ static int commit(struct pool *pool)
 895 return -EINVAL; 924 return -EINVAL;
896 925
897 r = dm_pool_commit_metadata(pool->pmd); 926 r = dm_pool_commit_metadata(pool->pmd);
898 if (r) { 927 if (r)
899 DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", 928 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
900 dm_device_name(pool->pool_md), r);
901 set_pool_mode(pool, PM_READ_ONLY);
902 }
903 929
904 return r; 930 return r;
905} 931}
906 932
907static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 933static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
908{ 934{
909 int r;
910 dm_block_t free_blocks;
911 unsigned long flags; 935 unsigned long flags;
912 struct pool *pool = tc->pool;
913
914 /*
915 * Once no_free_space is set we must not allow allocation to succeed.
916 * Otherwise it is difficult to explain, debug, test and support.
917 */
918 if (pool->no_free_space)
919 return -ENOSPC;
920
921 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
922 if (r)
923 return r;
924 936
925 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 937 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
926 DMWARN("%s: reached low water mark for data device: sending event.", 938 DMWARN("%s: reached low water mark for data device: sending event.",
927 dm_device_name(pool->pool_md)); 939 dm_device_name(pool->pool_md));
928 spin_lock_irqsave(&pool->lock, flags); 940 spin_lock_irqsave(&pool->lock, flags);
929 pool->low_water_triggered = 1; 941 pool->low_water_triggered = true;
930 spin_unlock_irqrestore(&pool->lock, flags); 942 spin_unlock_irqrestore(&pool->lock, flags);
931 dm_table_event(pool->ti->table); 943 dm_table_event(pool->ti->table);
932 } 944 }
945}
946
947static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
948
949static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
950{
951 int r;
952 dm_block_t free_blocks;
953 struct pool *pool = tc->pool;
954
955 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
956 return -EINVAL;
957
958 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
959 if (r) {
960 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
961 return r;
962 }
963
964 check_low_water_mark(pool, free_blocks);
933 965
934 if (!free_blocks) { 966 if (!free_blocks) {
935 /* 967 /*
@@ -941,35 +973,20 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
941 return r; 973 return r;
942 974
943 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 975 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
944 if (r) 976 if (r) {
977 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
945 return r; 978 return r;
979 }
946 980
947 /*
948 * If we still have no space we set a flag to avoid
949 * doing all this checking and return -ENOSPC. This
950 * flag serves as a latch that disallows allocations from
951 * this pool until the admin takes action (e.g. resize or
952 * table reload).
953 */
954 if (!free_blocks) { 981 if (!free_blocks) {
955 DMWARN("%s: no free data space available.", 982 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
956 dm_device_name(pool->pool_md));
957 spin_lock_irqsave(&pool->lock, flags);
958 pool->no_free_space = 1;
959 spin_unlock_irqrestore(&pool->lock, flags);
960 return -ENOSPC; 983 return -ENOSPC;
961 } 984 }
962 } 985 }
963 986
964 r = dm_pool_alloc_data_block(pool->pmd, result); 987 r = dm_pool_alloc_data_block(pool->pmd, result);
965 if (r) { 988 if (r) {
966 if (r == -ENOSPC && 989 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
967 !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
968 !free_blocks) {
969 DMWARN("%s: no free metadata space available.",
970 dm_device_name(pool->pool_md));
971 set_pool_mode(pool, PM_READ_ONLY);
972 }
973 return r; 990 return r;
974 } 991 }
975 992
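alloc_data_block() loses the sticky no_free_space latch; running out of space is now a pool mode (PM_OUT_OF_DATA_SPACE) that a later resize or resume can clear. A condensed restatement of the new flow with the error paths annotated; this is a sketch of the hunks above, not a drop-in replacement:

    static int alloc_data_block_sketch(struct thin_c *tc, dm_block_t *result)
    {
            int r;
            dm_block_t free_blocks;
            struct pool *pool = tc->pool;

            if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                    return -EINVAL;                 /* never allocate from a degraded pool */

            r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
            if (r) {
                    metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                    return r;                       /* transaction aborted, pool now read-only */
            }

            check_low_water_mark(pool, free_blocks); /* one-shot dm event near exhaustion */

            if (!free_blocks) {
                    /* A commit may release blocks held by the previous transaction. */
                    r = commit(pool);
                    if (r)
                            return r;

                    r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                    if (r) {
                            metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                            return r;
                    }

                    if (!free_blocks) {
                            set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                            return -ENOSPC;         /* callers queue or error bios per error_if_no_space */
                    }
            }

            r = dm_pool_alloc_data_block(pool->pmd, result);
            if (r)
                    metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);

            return r;
    }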
@@ -992,16 +1009,56 @@ static void retry_on_resume(struct bio *bio)
992 spin_unlock_irqrestore(&pool->lock, flags); 1009 spin_unlock_irqrestore(&pool->lock, flags);
993} 1010}
994 1011
995static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) 1012static bool should_error_unserviceable_bio(struct pool *pool)
1013{
1014 enum pool_mode m = get_pool_mode(pool);
1015
1016 switch (m) {
1017 case PM_WRITE:
1018 /* Shouldn't get here */
1019 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1020 return true;
1021
1022 case PM_OUT_OF_DATA_SPACE:
1023 return pool->pf.error_if_no_space;
1024
1025 case PM_READ_ONLY:
1026 case PM_FAIL:
1027 return true;
1028 default:
1029 /* Shouldn't get here */
1030 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1031 return true;
1032 }
1033}
1034
1035static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1036{
1037 if (should_error_unserviceable_bio(pool))
1038 bio_io_error(bio);
1039 else
1040 retry_on_resume(bio);
1041}
1042
1043static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
996{ 1044{
997 struct bio *bio; 1045 struct bio *bio;
998 struct bio_list bios; 1046 struct bio_list bios;
999 1047
1048 if (should_error_unserviceable_bio(pool)) {
1049 cell_error(pool, cell);
1050 return;
1051 }
1052
1000 bio_list_init(&bios); 1053 bio_list_init(&bios);
1001 cell_release(pool, cell, &bios); 1054 cell_release(pool, cell, &bios);
1002 1055
1003 while ((bio = bio_list_pop(&bios))) 1056 if (should_error_unserviceable_bio(pool))
1004 retry_on_resume(bio); 1057 while ((bio = bio_list_pop(&bios)))
1058 bio_io_error(bio);
1059 else
1060 while ((bio = bio_list_pop(&bios)))
1061 retry_on_resume(bio);
1005} 1062}
1006 1063
1007static void process_discard(struct thin_c *tc, struct bio *bio) 1064static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1040,17 +1097,17 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1040 */ 1097 */
1041 m = get_next_mapping(pool); 1098 m = get_next_mapping(pool);
1042 m->tc = tc; 1099 m->tc = tc;
1043 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; 1100 m->pass_discard = pool->pf.discard_passdown;
1101 m->definitely_not_shared = !lookup_result.shared;
1044 m->virt_block = block; 1102 m->virt_block = block;
1045 m->data_block = lookup_result.block; 1103 m->data_block = lookup_result.block;
1046 m->cell = cell; 1104 m->cell = cell;
1047 m->cell2 = cell2; 1105 m->cell2 = cell2;
1048 m->err = 0;
1049 m->bio = bio; 1106 m->bio = bio;
1050 1107
1051 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1108 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1052 spin_lock_irqsave(&pool->lock, flags); 1109 spin_lock_irqsave(&pool->lock, flags);
1053 list_add(&m->list, &pool->prepared_discards); 1110 list_add_tail(&m->list, &pool->prepared_discards);
1054 spin_unlock_irqrestore(&pool->lock, flags); 1111 spin_unlock_irqrestore(&pool->lock, flags);
1055 wake_worker(pool); 1112 wake_worker(pool);
1056 } 1113 }
@@ -1105,13 +1162,12 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1105 break; 1162 break;
1106 1163
1107 case -ENOSPC: 1164 case -ENOSPC:
1108 no_space(pool, cell); 1165 retry_bios_on_resume(pool, cell);
1109 break; 1166 break;
1110 1167
1111 default: 1168 default:
1112 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1169 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1113 __func__, r); 1170 __func__, r);
1114 set_pool_mode(pool, PM_READ_ONLY);
1115 cell_error(pool, cell); 1171 cell_error(pool, cell);
1116 break; 1172 break;
1117 } 1173 }
@@ -1133,7 +1189,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1133 if (bio_detain(pool, &key, bio, &cell)) 1189 if (bio_detain(pool, &key, bio, &cell))
1134 return; 1190 return;
1135 1191
1136 if (bio_data_dir(bio) == WRITE && bio->bi_size) 1192 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
1137 break_sharing(tc, bio, block, &key, lookup_result, cell); 1193 break_sharing(tc, bio, block, &key, lookup_result, cell);
1138 else { 1194 else {
1139 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1195 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1156,7 +1212,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1156 /* 1212 /*
1157 * Remap empty bios (flushes) immediately, without provisioning. 1213 * Remap empty bios (flushes) immediately, without provisioning.
1158 */ 1214 */
1159 if (!bio->bi_size) { 1215 if (!bio->bi_iter.bi_size) {
1160 inc_all_io_entry(pool, bio); 1216 inc_all_io_entry(pool, bio);
1161 cell_defer_no_holder(tc, cell); 1217 cell_defer_no_holder(tc, cell);
1162 1218
@@ -1184,13 +1240,12 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1184 break; 1240 break;
1185 1241
1186 case -ENOSPC: 1242 case -ENOSPC:
1187 no_space(pool, cell); 1243 retry_bios_on_resume(pool, cell);
1188 break; 1244 break;
1189 1245
1190 default: 1246 default:
1191 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1247 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1192 __func__, r); 1248 __func__, r);
1193 set_pool_mode(pool, PM_READ_ONLY);
1194 cell_error(pool, cell); 1249 cell_error(pool, cell);
1195 break; 1250 break;
1196 } 1251 }
@@ -1256,8 +1311,8 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1256 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1311 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1257 switch (r) { 1312 switch (r) {
1258 case 0: 1313 case 0:
1259 if (lookup_result.shared && (rw == WRITE) && bio->bi_size) 1314 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
1260 bio_io_error(bio); 1315 handle_unserviceable_bio(tc->pool, bio);
1261 else { 1316 else {
1262 inc_all_io_entry(tc->pool, bio); 1317 inc_all_io_entry(tc->pool, bio);
1263 remap_and_issue(tc, bio, lookup_result.block); 1318 remap_and_issue(tc, bio, lookup_result.block);
@@ -1266,7 +1321,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1266 1321
1267 case -ENODATA: 1322 case -ENODATA:
1268 if (rw != READ) { 1323 if (rw != READ) {
1269 bio_io_error(bio); 1324 handle_unserviceable_bio(tc->pool, bio);
1270 break; 1325 break;
1271 } 1326 }
1272 1327
@@ -1288,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1288 } 1343 }
1289} 1344}
1290 1345
1346static void process_bio_success(struct thin_c *tc, struct bio *bio)
1347{
1348 bio_endio(bio, 0);
1349}
1350
1291static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1351static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1292{ 1352{
1293 bio_io_error(bio); 1353 bio_io_error(bio);
@@ -1320,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
1320 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1380 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1321 struct thin_c *tc = h->tc; 1381 struct thin_c *tc = h->tc;
1322 1382
1383 if (tc->requeue_mode) {
1384 bio_endio(bio, DM_ENDIO_REQUEUE);
1385 continue;
1386 }
1387
1323 /* 1388 /*
1324 * If we've got no free new_mapping structs, and processing 1389 * If we've got no free new_mapping structs, and processing
1325 * this bio might require one, we pause until there are some 1390 * this bio might require one, we pause until there are some
@@ -1349,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool)
1349 bio_list_init(&pool->deferred_flush_bios); 1414 bio_list_init(&pool->deferred_flush_bios);
1350 spin_unlock_irqrestore(&pool->lock, flags); 1415 spin_unlock_irqrestore(&pool->lock, flags);
1351 1416
1352 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1417 if (bio_list_empty(&bios) &&
1418 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1353 return; 1419 return;
1354 1420
1354 1420
1355 if (commit(pool)) { 1421 if (commit(pool)) {
@@ -1385,46 +1451,134 @@ static void do_waker(struct work_struct *ws)
1385 1451
1386/*----------------------------------------------------------------*/ 1452/*----------------------------------------------------------------*/
1387 1453
1454struct noflush_work {
1455 struct work_struct worker;
1456 struct thin_c *tc;
1457
1458 atomic_t complete;
1459 wait_queue_head_t wait;
1460};
1461
1462static void complete_noflush_work(struct noflush_work *w)
1463{
1464 atomic_set(&w->complete, 1);
1465 wake_up(&w->wait);
1466}
1467
1468static void do_noflush_start(struct work_struct *ws)
1469{
1470 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1471 w->tc->requeue_mode = true;
1472 requeue_io(w->tc);
1473 complete_noflush_work(w);
1474}
1475
1476static void do_noflush_stop(struct work_struct *ws)
1477{
1478 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1479 w->tc->requeue_mode = false;
1480 complete_noflush_work(w);
1481}
1482
1483static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1484{
1485 struct noflush_work w;
1486
1487 INIT_WORK(&w.worker, fn);
1488 w.tc = tc;
1489 atomic_set(&w.complete, 0);
1490 init_waitqueue_head(&w.wait);
1491
1492 queue_work(tc->pool->wq, &w.worker);
1493
1494 wait_event(w.wait, atomic_read(&w.complete));
1495}
1496
1497/*----------------------------------------------------------------*/
1498
1388static enum pool_mode get_pool_mode(struct pool *pool) 1499static enum pool_mode get_pool_mode(struct pool *pool)
1389{ 1500{
1390 return pool->pf.mode; 1501 return pool->pf.mode;
1391} 1502}
1392 1503
1393static void set_pool_mode(struct pool *pool, enum pool_mode mode) 1504static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1394{ 1505{
1395 int r; 1506 dm_table_event(pool->ti->table);
1507 DMINFO("%s: switching pool to %s mode",
1508 dm_device_name(pool->pool_md), new_mode);
1509}
1396 1510
1397 pool->pf.mode = mode; 1511static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1512{
1513 struct pool_c *pt = pool->ti->private;
1514 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1515 enum pool_mode old_mode = get_pool_mode(pool);
1398 1516
1399 switch (mode) { 1517 /*
1400 case PM_FAIL: 1518 * Never allow the pool to transition to PM_WRITE mode if user
1401 DMERR("%s: switching pool to failure mode", 1519 * intervention is required to verify metadata and data consistency.
1520 */
1521 if (new_mode == PM_WRITE && needs_check) {
1522 DMERR("%s: unable to switch pool to write mode until repaired.",
1402 dm_device_name(pool->pool_md)); 1523 dm_device_name(pool->pool_md));
1524 if (old_mode != new_mode)
1525 new_mode = old_mode;
1526 else
1527 new_mode = PM_READ_ONLY;
1528 }
1529 /*
1530 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1531 * not going to recover without a thin_repair. So we never let the
1532 * pool move out of the old mode.
1533 */
1534 if (old_mode == PM_FAIL)
1535 new_mode = old_mode;
1536
1537 switch (new_mode) {
1538 case PM_FAIL:
1539 if (old_mode != new_mode)
1540 notify_of_pool_mode_change(pool, "failure");
1403 dm_pool_metadata_read_only(pool->pmd); 1541 dm_pool_metadata_read_only(pool->pmd);
1404 pool->process_bio = process_bio_fail; 1542 pool->process_bio = process_bio_fail;
1405 pool->process_discard = process_bio_fail; 1543 pool->process_discard = process_bio_fail;
1406 pool->process_prepared_mapping = process_prepared_mapping_fail; 1544 pool->process_prepared_mapping = process_prepared_mapping_fail;
1407 pool->process_prepared_discard = process_prepared_discard_fail; 1545 pool->process_prepared_discard = process_prepared_discard_fail;
1546
1547 error_retry_list(pool);
1408 break; 1548 break;
1409 1549
1410 case PM_READ_ONLY: 1550 case PM_READ_ONLY:
1411 DMERR("%s: switching pool to read-only mode", 1551 if (old_mode != new_mode)
1412 dm_device_name(pool->pool_md)); 1552 notify_of_pool_mode_change(pool, "read-only");
1413 r = dm_pool_abort_metadata(pool->pmd); 1553 dm_pool_metadata_read_only(pool->pmd);
1414 if (r) { 1554 pool->process_bio = process_bio_read_only;
1415 DMERR("%s: aborting transaction failed", 1555 pool->process_discard = process_bio_success;
1416 dm_device_name(pool->pool_md)); 1556 pool->process_prepared_mapping = process_prepared_mapping_fail;
1417 set_pool_mode(pool, PM_FAIL); 1557 pool->process_prepared_discard = process_prepared_discard_passdown;
1418 } else { 1558
1419 dm_pool_metadata_read_only(pool->pmd); 1559 error_retry_list(pool);
1420 pool->process_bio = process_bio_read_only; 1560 break;
1421 pool->process_discard = process_discard; 1561
1422 pool->process_prepared_mapping = process_prepared_mapping_fail; 1562 case PM_OUT_OF_DATA_SPACE:
1423 pool->process_prepared_discard = process_prepared_discard_passdown; 1563 /*
1424 } 1564 * Ideally we'd never hit this state; the low water mark
1565 * would trigger userland to extend the pool before we
1566 * completely run out of data space. However, many small
1567 * IOs to unprovisioned space can consume data space at an
1568 * alarming rate. Adjust your low water mark if you're
1569 * frequently seeing this mode.
1570 */
1571 if (old_mode != new_mode)
1572 notify_of_pool_mode_change(pool, "out-of-data-space");
1573 pool->process_bio = process_bio_read_only;
1574 pool->process_discard = process_discard;
1575 pool->process_prepared_mapping = process_prepared_mapping;
1576 pool->process_prepared_discard = process_prepared_discard_passdown;
1425 break; 1577 break;
1426 1578
1427 case PM_WRITE: 1579 case PM_WRITE:
1580 if (old_mode != new_mode)
1581 notify_of_pool_mode_change(pool, "write");
1428 dm_pool_metadata_read_write(pool->pmd); 1582 dm_pool_metadata_read_write(pool->pmd);
1429 pool->process_bio = process_bio; 1583 pool->process_bio = process_bio;
1430 pool->process_discard = process_discard; 1584 pool->process_discard = process_discard;
@@ -1432,6 +1586,38 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1432 pool->process_prepared_discard = process_prepared_discard; 1586 pool->process_prepared_discard = process_prepared_discard;
1433 break; 1587 break;
1434 } 1588 }
1589
1590 pool->pf.mode = new_mode;
1591 /*
1592 * The pool mode may have changed, sync it so bind_control_target()
1593 * doesn't cause an unexpected mode transition on resume.
1594 */
1595 pt->adjusted_pf.mode = new_mode;
1596}
1597
1598static void abort_transaction(struct pool *pool)
1599{
1600 const char *dev_name = dm_device_name(pool->pool_md);
1601
1602 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1603 if (dm_pool_abort_metadata(pool->pmd)) {
1604 DMERR("%s: failed to abort metadata transaction", dev_name);
1605 set_pool_mode(pool, PM_FAIL);
1606 }
1607
1608 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1609 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1610 set_pool_mode(pool, PM_FAIL);
1611 }
1612}
1613
1614static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1615{
1616 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1617 dm_device_name(pool->pool_md), op, r);
1618
1619 abort_transaction(pool);
1620 set_pool_mode(pool, PM_READ_ONLY);
1435} 1621}
1436 1622
1437/*----------------------------------------------------------------*/ 1623/*----------------------------------------------------------------*/
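The refactor above gives the pool a single failure funnel: metadata_operation_failed() logs the failing operation, abort_transaction() rolls back and sets the needs_check flag, and the pool drops to read-only; set_pool_mode() itself now only notifies userspace when the mode actually changes. A minimal standalone C sketch of that notify-on-transition idea follows; the names mirror the driver but the bodies are illustrative stand-ins, not kernel code.

#include <stdio.h>

enum pool_mode { PM_WRITE, PM_OUT_OF_DATA_SPACE, PM_READ_ONLY, PM_FAIL };

struct pool {
    enum pool_mode mode;
    const char *name;
};

/* Stand-in for the uevent/table-event delivery done in the driver. */
static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
{
    printf("%s: switching pool to %s mode\n", pool->name, new_mode);
}

static const char *mode_string(enum pool_mode m)
{
    switch (m) {
    case PM_WRITE:             return "write";
    case PM_OUT_OF_DATA_SPACE: return "out-of-data-space";
    case PM_READ_ONLY:         return "read-only";
    default:                   return "failure";
    }
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
    enum pool_mode old_mode = pool->mode;

    /* Only tell userspace when the mode actually transitions. */
    if (old_mode != new_mode)
        notify_of_pool_mode_change(pool, mode_string(new_mode));

    pool->mode = new_mode;
}

int main(void)
{
    struct pool pool = { PM_WRITE, "pool0" };

    set_pool_mode(&pool, PM_WRITE);     /* no change: no notification */
    set_pool_mode(&pool, PM_READ_ONLY); /* notifies once */
    set_pool_mode(&pool, PM_READ_ONLY); /* no change again */
    return 0;
}

Running it prints a single "switching pool to read-only mode" line for the two identical calls, which is exactly the behaviour the old_mode != new_mode guard buys.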
@@ -1481,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1481 1667
1482 thin_hook_bio(tc, bio); 1668 thin_hook_bio(tc, bio);
1483 1669
1670 if (tc->requeue_mode) {
1671 bio_endio(bio, DM_ENDIO_REQUEUE);
1672 return DM_MAPIO_SUBMITTED;
1673 }
1674
1484 if (get_pool_mode(tc->pool) == PM_FAIL) { 1675 if (get_pool_mode(tc->pool) == PM_FAIL) {
1485 bio_io_error(bio); 1676 bio_io_error(bio);
1486 return DM_MAPIO_SUBMITTED; 1677 return DM_MAPIO_SUBMITTED;
@@ -1538,9 +1729,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1538 if (get_pool_mode(tc->pool) == PM_READ_ONLY) { 1729 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1539 /* 1730 /*
1540 * This block isn't provisioned, and we have no way 1731 * This block isn't provisioned, and we have no way
1541 * of doing so. Just error it. 1732 * of doing so.
1542 */ 1733 */
1543 bio_io_error(bio); 1734 handle_unserviceable_bio(tc->pool, bio);
1544 return DM_MAPIO_SUBMITTED; 1735 return DM_MAPIO_SUBMITTED;
1545 } 1736 }
1546 /* fall through */ 1737 /* fall through */
@@ -1644,22 +1835,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1644 /* 1835 /*
1645 * We want to make sure that a pool in PM_FAIL mode is never upgraded. 1836 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
1646 */ 1837 */
1647 enum pool_mode old_mode = pool->pf.mode; 1838 enum pool_mode old_mode = get_pool_mode(pool);
1648 enum pool_mode new_mode = pt->adjusted_pf.mode; 1839 enum pool_mode new_mode = pt->adjusted_pf.mode;
1649 1840
1650 /* 1841 /*
1651 * If we were in PM_FAIL mode, rollback of metadata failed. We're 1842 * Don't change the pool's mode until set_pool_mode() below.
1652 * not going to recover without a thin_repair. So we never let the 1843 * Otherwise the pool's process_* function pointers may
1653 * pool move out of the old mode. On the other hand a PM_READ_ONLY 1844 * not match the desired pool mode.
1654 * may have been due to a lack of metadata or data space, and may
1655 * now work (ie. if the underlying devices have been resized).
1656 */ 1845 */
1657 if (old_mode == PM_FAIL) 1846 pt->adjusted_pf.mode = old_mode;
1658 new_mode = old_mode;
1659 1847
1660 pool->ti = ti; 1848 pool->ti = ti;
1661 pool->low_water_blocks = pt->low_water_blocks;
1662 pool->pf = pt->adjusted_pf; 1849 pool->pf = pt->adjusted_pf;
1850 pool->low_water_blocks = pt->low_water_blocks;
1663 1851
1664 set_pool_mode(pool, new_mode); 1852 set_pool_mode(pool, new_mode);
1665 1853
@@ -1682,6 +1870,7 @@ static void pool_features_init(struct pool_features *pf)
1682 pf->zero_new_blocks = true; 1870 pf->zero_new_blocks = true;
1683 pf->discard_enabled = true; 1871 pf->discard_enabled = true;
1684 pf->discard_passdown = true; 1872 pf->discard_passdown = true;
1873 pf->error_if_no_space = false;
1685} 1874}
1686 1875
1687static void __pool_destroy(struct pool *pool) 1876static void __pool_destroy(struct pool *pool)
@@ -1772,8 +1961,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1772 bio_list_init(&pool->deferred_flush_bios); 1961 bio_list_init(&pool->deferred_flush_bios);
1773 INIT_LIST_HEAD(&pool->prepared_mappings); 1962 INIT_LIST_HEAD(&pool->prepared_mappings);
1774 INIT_LIST_HEAD(&pool->prepared_discards); 1963 INIT_LIST_HEAD(&pool->prepared_discards);
1775 pool->low_water_triggered = 0; 1964 pool->low_water_triggered = false;
1776 pool->no_free_space = 0;
1777 bio_list_init(&pool->retry_on_resume_list); 1965 bio_list_init(&pool->retry_on_resume_list);
1778 1966
1779 pool->shared_read_ds = dm_deferred_set_create(); 1967 pool->shared_read_ds = dm_deferred_set_create();
@@ -1898,7 +2086,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1898 const char *arg_name; 2086 const char *arg_name;
1899 2087
1900 static struct dm_arg _args[] = { 2088 static struct dm_arg _args[] = {
1901 {0, 3, "Invalid number of pool feature arguments"}, 2089 {0, 4, "Invalid number of pool feature arguments"},
1902 }; 2090 };
1903 2091
1904 /* 2092 /*
@@ -1927,6 +2115,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1927 else if (!strcasecmp(arg_name, "read_only")) 2115 else if (!strcasecmp(arg_name, "read_only"))
1928 pf->mode = PM_READ_ONLY; 2116 pf->mode = PM_READ_ONLY;
1929 2117
2118 else if (!strcasecmp(arg_name, "error_if_no_space"))
2119 pf->error_if_no_space = true;
2120
1930 else { 2121 else {
1931 ti->error = "Unrecognised pool feature requested"; 2122 ti->error = "Unrecognised pool feature requested";
1932 r = -EINVAL; 2123 r = -EINVAL;
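With error_if_no_space the pool table now accepts up to four optional feature words. A hedged userspace sketch of the same keyword matching is below; pool_features is flattened here (the mode collapsed to a read_only flag) and the argument handling is plain argc/argv rather than dm_arg_set.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

struct pool_features {
    bool zero_new_blocks;
    bool discard_enabled;
    bool discard_passdown;
    bool read_only;
    bool error_if_no_space;
};

/* Returns 0 on success, -1 on an unrecognised feature word. */
static int parse_pool_features(int argc, char **argv, struct pool_features *pf)
{
    /* Defaults match the "no feature args" case. */
    pf->zero_new_blocks = true;
    pf->discard_enabled = true;
    pf->discard_passdown = true;
    pf->read_only = false;
    pf->error_if_no_space = false;

    for (int i = 0; i < argc; i++) {
        if (!strcasecmp(argv[i], "skip_block_zeroing"))
            pf->zero_new_blocks = false;
        else if (!strcasecmp(argv[i], "ignore_discard"))
            pf->discard_enabled = false;
        else if (!strcasecmp(argv[i], "no_discard_passdown"))
            pf->discard_passdown = false;
        else if (!strcasecmp(argv[i], "read_only"))
            pf->read_only = true;
        else if (!strcasecmp(argv[i], "error_if_no_space"))
            pf->error_if_no_space = true;
        else {
            fprintf(stderr, "Unrecognised pool feature: %s\n", argv[i]);
            return -1;
        }
    }
    return 0;
}

int main(int argc, char **argv)
{
    struct pool_features pf;

    if (parse_pool_features(argc - 1, argv + 1, &pf))
        return 1;
    printf("error_if_no_space=%d read_only=%d\n",
           pf.error_if_no_space, pf.read_only);
    return 0;
}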
@@ -1947,16 +2138,27 @@ static void metadata_low_callback(void *context)
1947 dm_table_event(pool->ti->table); 2138 dm_table_event(pool->ti->table);
1948} 2139}
1949 2140
1950static sector_t get_metadata_dev_size(struct block_device *bdev) 2141static sector_t get_dev_size(struct block_device *bdev)
1951{ 2142{
1952 sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 2143 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2144}
2145
2146static void warn_if_metadata_device_too_big(struct block_device *bdev)
2147{
2148 sector_t metadata_dev_size = get_dev_size(bdev);
1953 char buffer[BDEVNAME_SIZE]; 2149 char buffer[BDEVNAME_SIZE];
1954 2150
1955 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { 2151 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1956 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2152 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1957 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); 2153 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
1958 metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; 2154}
1959 } 2155
2156static sector_t get_metadata_dev_size(struct block_device *bdev)
2157{
2158 sector_t metadata_dev_size = get_dev_size(bdev);
2159
2160 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2161 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
1960 2162
1961 return metadata_dev_size; 2163 return metadata_dev_size;
1962} 2164}
@@ -1965,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
1965{ 2167{
1966 sector_t metadata_dev_size = get_metadata_dev_size(bdev); 2168 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
1967 2169
1968 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 2170 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
1969 2171
1970 return metadata_dev_size; 2172 return metadata_dev_size;
1971} 2173}
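get_metadata_dev_size() now clamps silently to THIN_METADATA_MAX_SECTORS (the warning lives in warn_if_metadata_device_too_big()), and the divisor losing its ">> SECTOR_SHIFT" suggests THIN_METADATA_BLOCK_SIZE is expressed in sectors in this series. A small sketch of the clamp-then-convert path, with both constants treated as illustrative values rather than the driver's real ones:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Illustrative values: a 4 KiB metadata block is 8 x 512-byte sectors. */
#define THIN_METADATA_BLOCK_SIZE   8ULL  /* sectors */
#define THIN_METADATA_MAX_SECTORS  (255ULL * (1 << 14) * THIN_METADATA_BLOCK_SIZE)

static sector_t clamp_metadata_dev_size(sector_t dev_sectors)
{
    if (dev_sectors > THIN_METADATA_MAX_SECTORS)
        dev_sectors = THIN_METADATA_MAX_SECTORS;
    return dev_sectors;
}

static uint64_t metadata_dev_size_in_blocks(sector_t dev_sectors)
{
    /* sector_div() in the kernel; plain integer division here. */
    return clamp_metadata_dev_size(dev_sectors) / THIN_METADATA_BLOCK_SIZE;
}

int main(void)
{
    sector_t huge = 1ULL << 40; /* a deliberately oversized device */

    printf("usable metadata blocks: %llu\n",
           (unsigned long long)metadata_dev_size_in_blocks(huge));
    return 0;
}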
@@ -1997,6 +2199,8 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt)
1997 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 2199 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1998 * ignore_discard: disable discard 2200 * ignore_discard: disable discard
1999 * no_discard_passdown: don't pass discards down to the data device 2201 * no_discard_passdown: don't pass discards down to the data device
2202 * read_only: Don't allow any changes to be made to the pool metadata.
2203 * error_if_no_space: error IOs, instead of queueing, if no space.
2000 */ 2204 */
2001static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 2205static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2002{ 2206{
@@ -2041,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2041 ti->error = "Error opening metadata block device"; 2245 ti->error = "Error opening metadata block device";
2042 goto out_unlock; 2246 goto out_unlock;
2043 } 2247 }
2044 2248 warn_if_metadata_device_too_big(metadata_dev->bdev);
2045 /*
2046 * Run for the side-effect of possibly issuing a warning if the
2047 * device is too big.
2048 */
2049 (void) get_metadata_dev_size(metadata_dev->bdev);
2050 2249
2051 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2250 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2052 if (r) { 2251 if (r) {
@@ -2192,11 +2391,19 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2192 return -EINVAL; 2391 return -EINVAL;
2193 2392
2194 } else if (data_size > sb_data_size) { 2393 } else if (data_size > sb_data_size) {
2394 if (dm_pool_metadata_needs_check(pool->pmd)) {
2395 DMERR("%s: unable to grow the data device until repaired.",
2396 dm_device_name(pool->pool_md));
2397 return 0;
2398 }
2399
2400 if (sb_data_size)
2401 DMINFO("%s: growing the data device from %llu to %llu blocks",
2402 dm_device_name(pool->pool_md),
2403 sb_data_size, (unsigned long long)data_size);
2195 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2404 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2196 if (r) { 2405 if (r) {
2197 DMERR("%s: failed to resize data device", 2406 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2198 dm_device_name(pool->pool_md));
2199 set_pool_mode(pool, PM_READ_ONLY);
2200 return r; 2407 return r;
2201 } 2408 }
2202 2409
@@ -2231,10 +2438,19 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2231 return -EINVAL; 2438 return -EINVAL;
2232 2439
2233 } else if (metadata_dev_size > sb_metadata_dev_size) { 2440 } else if (metadata_dev_size > sb_metadata_dev_size) {
2441 if (dm_pool_metadata_needs_check(pool->pmd)) {
2442 DMERR("%s: unable to grow the metadata device until repaired.",
2443 dm_device_name(pool->pool_md));
2444 return 0;
2445 }
2446
2447 warn_if_metadata_device_too_big(pool->md_dev);
2448 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2449 dm_device_name(pool->pool_md),
2450 sb_metadata_dev_size, metadata_dev_size);
2234 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2451 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2235 if (r) { 2452 if (r) {
2236 DMERR("%s: failed to resize metadata device", 2453 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2237 dm_device_name(pool->pool_md));
2238 return r; 2454 return r;
2239 } 2455 }
2240 2456
@@ -2290,8 +2506,7 @@ static void pool_resume(struct dm_target *ti)
2290 unsigned long flags; 2506 unsigned long flags;
2291 2507
2292 spin_lock_irqsave(&pool->lock, flags); 2508 spin_lock_irqsave(&pool->lock, flags);
2293 pool->low_water_triggered = 0; 2509 pool->low_water_triggered = false;
2294 pool->no_free_space = 0;
2295 __requeue_bios(pool); 2510 __requeue_bios(pool);
2296 spin_unlock_irqrestore(&pool->lock, flags); 2511 spin_unlock_irqrestore(&pool->lock, flags);
2297 2512
@@ -2510,7 +2725,8 @@ static void emit_flags(struct pool_features *pf, char *result,
2510 unsigned sz, unsigned maxlen) 2725 unsigned sz, unsigned maxlen)
2511{ 2726{
2512 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 2727 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2513 !pf->discard_passdown + (pf->mode == PM_READ_ONLY); 2728 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2729 pf->error_if_no_space;
2514 DMEMIT("%u ", count); 2730 DMEMIT("%u ", count);
2515 2731
2516 if (!pf->zero_new_blocks) 2732 if (!pf->zero_new_blocks)
@@ -2524,6 +2740,9 @@ static void emit_flags(struct pool_features *pf, char *result,
2524 2740
2525 if (pf->mode == PM_READ_ONLY) 2741 if (pf->mode == PM_READ_ONLY)
2526 DMEMIT("read_only "); 2742 DMEMIT("read_only ");
2743
2744 if (pf->error_if_no_space)
2745 DMEMIT("error_if_no_space ");
2527} 2746}
2528 2747
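emit_flags() has to keep the leading count in lock step with the words it prints, and the hunk adds error_if_no_space to both halves. The standalone sketch below shows the same count-then-emit discipline with snprintf standing in for DMEMIT; the PM_READ_ONLY mode test is simplified to a boolean.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct pool_features {
    bool zero_new_blocks, discard_enabled, discard_passdown;
    bool read_only, error_if_no_space;
};

static void emit_flags(const struct pool_features *pf, char *buf, size_t len)
{
    unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
                     !pf->discard_passdown + pf->read_only +
                     pf->error_if_no_space;
    size_t sz = 0;

    /* Count first, then one word per set flag, mirroring emit_flags(). */
    sz += snprintf(buf + sz, len - sz, "%u ", count);
    if (!pf->zero_new_blocks)
        sz += snprintf(buf + sz, len - sz, "skip_block_zeroing ");
    if (!pf->discard_enabled)
        sz += snprintf(buf + sz, len - sz, "ignore_discard ");
    if (!pf->discard_passdown)
        sz += snprintf(buf + sz, len - sz, "no_discard_passdown ");
    if (pf->read_only)
        sz += snprintf(buf + sz, len - sz, "read_only ");
    if (pf->error_if_no_space)
        sz += snprintf(buf + sz, len - sz, "error_if_no_space ");
}

int main(void)
{
    struct pool_features pf = { true, true, true, false, true };
    char line[128];

    emit_flags(&pf, line, sizeof(line));
    printf("%s\n", line);   /* prints: 1 error_if_no_space */
    return 0;
}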
2529/* 2748/*
@@ -2612,17 +2831,24 @@ static void pool_status(struct dm_target *ti, status_type_t type,
2612 else 2831 else
2613 DMEMIT("- "); 2832 DMEMIT("- ");
2614 2833
2615 if (pool->pf.mode == PM_READ_ONLY) 2834 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2835 DMEMIT("out_of_data_space ");
2836 else if (pool->pf.mode == PM_READ_ONLY)
2616 DMEMIT("ro "); 2837 DMEMIT("ro ");
2617 else 2838 else
2618 DMEMIT("rw "); 2839 DMEMIT("rw ");
2619 2840
2620 if (!pool->pf.discard_enabled) 2841 if (!pool->pf.discard_enabled)
2621 DMEMIT("ignore_discard"); 2842 DMEMIT("ignore_discard ");
2622 else if (pool->pf.discard_passdown) 2843 else if (pool->pf.discard_passdown)
2623 DMEMIT("discard_passdown"); 2844 DMEMIT("discard_passdown ");
2624 else 2845 else
2625 DMEMIT("no_discard_passdown"); 2846 DMEMIT("no_discard_passdown ");
2847
2848 if (pool->pf.error_if_no_space)
2849 DMEMIT("error_if_no_space ");
2850 else
2851 DMEMIT("queue_if_no_space ");
2626 2852
2627 break; 2853 break;
2628 2854
@@ -2721,7 +2947,7 @@ static struct target_type pool_target = {
2721 .name = "thin-pool", 2947 .name = "thin-pool",
2722 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2948 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2723 DM_TARGET_IMMUTABLE, 2949 DM_TARGET_IMMUTABLE,
2724 .version = {1, 9, 0}, 2950 .version = {1, 11, 0},
2725 .module = THIS_MODULE, 2951 .module = THIS_MODULE,
2726 .ctr = pool_ctr, 2952 .ctr = pool_ctr,
2727 .dtr = pool_dtr, 2953 .dtr = pool_dtr,
@@ -2828,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2828 3054
2829 if (get_pool_mode(tc->pool) == PM_FAIL) { 3055 if (get_pool_mode(tc->pool) == PM_FAIL) {
2830 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3056 ti->error = "Couldn't open thin device, Pool is in fail mode";
3057 r = -EINVAL;
2831 goto bad_thin_open; 3058 goto bad_thin_open;
2832 } 3059 }
2833 3060
@@ -2839,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2839 3066
2840 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3067 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2841 if (r) 3068 if (r)
2842 goto bad_thin_open; 3069 goto bad_target_max_io_len;
2843 3070
2844 ti->num_flush_bios = 1; 3071 ti->num_flush_bios = 1;
2845 ti->flush_supported = true; 3072 ti->flush_supported = true;
@@ -2860,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2860 3087
2861 return 0; 3088 return 0;
2862 3089
3090bad_target_max_io_len:
3091 dm_pool_close_thin_device(tc->td);
2863bad_thin_open: 3092bad_thin_open:
2864 __pool_dec(tc->pool); 3093 __pool_dec(tc->pool);
2865bad_pool_lookup: 3094bad_pool_lookup:
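The new bad_target_max_io_len label closes a small leak: when dm_set_target_max_io_len() failed, the freshly opened thin device was never closed before the pool reference was dropped. The generic idiom is one label per acquired resource, unwound in reverse order; a userspace sketch with made-up resource names:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical resources standing in for the pool lookup, thin device, etc. */
static void *acquire(const char *what) { printf("acquire %s\n", what); return malloc(1); }
static void release(const char *what, void *p) { printf("release %s\n", what); free(p); }
static int might_fail(int fail) { return fail ? -1 : 0; }

static int ctr(int fail_step)
{
    void *pool = NULL, *thin = NULL;
    int r;

    pool = acquire("pool");
    if (!pool)
        return -1;

    thin = acquire("thin device");
    if (!thin) {
        r = -1;
        goto bad_thin_open;
    }

    r = might_fail(fail_step == 3);     /* e.g. dm_set_target_max_io_len() */
    if (r)
        goto bad_target_max_io_len;     /* must close the thin device too */

    return 0;

bad_target_max_io_len:
    release("thin device", thin);
bad_thin_open:
    release("pool", pool);
    return r;
}

int main(void)
{
    return ctr(3) ? 1 : 0;  /* fails at step 3, unwinds both resources */
}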
@@ -2879,7 +3108,7 @@ out_unlock:
2879 3108
2880static int thin_map(struct dm_target *ti, struct bio *bio) 3109static int thin_map(struct dm_target *ti, struct bio *bio)
2881{ 3110{
2882 bio->bi_sector = dm_target_offset(ti, bio->bi_sector); 3111 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
2883 3112
2884 return thin_bio_map(ti, bio); 3113 return thin_bio_map(ti, bio);
2885} 3114}
@@ -2899,7 +3128,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2899 spin_lock_irqsave(&pool->lock, flags); 3128 spin_lock_irqsave(&pool->lock, flags);
2900 list_for_each_entry_safe(m, tmp, &work, list) { 3129 list_for_each_entry_safe(m, tmp, &work, list) {
2901 list_del(&m->list); 3130 list_del(&m->list);
2902 m->quiesced = 1; 3131 m->quiesced = true;
2903 __maybe_add_mapping(m); 3132 __maybe_add_mapping(m);
2904 } 3133 }
2905 spin_unlock_irqrestore(&pool->lock, flags); 3134 spin_unlock_irqrestore(&pool->lock, flags);
@@ -2911,7 +3140,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2911 if (!list_empty(&work)) { 3140 if (!list_empty(&work)) {
2912 spin_lock_irqsave(&pool->lock, flags); 3141 spin_lock_irqsave(&pool->lock, flags);
2913 list_for_each_entry_safe(m, tmp, &work, list) 3142 list_for_each_entry_safe(m, tmp, &work, list)
2914 list_add(&m->list, &pool->prepared_discards); 3143 list_add_tail(&m->list, &pool->prepared_discards);
2915 spin_unlock_irqrestore(&pool->lock, flags); 3144 spin_unlock_irqrestore(&pool->lock, flags);
2916 wake_worker(pool); 3145 wake_worker(pool);
2917 } 3146 }
@@ -2920,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2920 return 0; 3149 return 0;
2921} 3150}
2922 3151
2923static void thin_postsuspend(struct dm_target *ti) 3152static void thin_presuspend(struct dm_target *ti)
2924{ 3153{
3154 struct thin_c *tc = ti->private;
3155
2925 if (dm_noflush_suspending(ti)) 3156 if (dm_noflush_suspending(ti))
2926 requeue_io((struct thin_c *)ti->private); 3157 noflush_work(tc, do_noflush_start);
3158}
3159
3160static void thin_postsuspend(struct dm_target *ti)
3161{
3162 struct thin_c *tc = ti->private;
3163
3164 /*
3165 * The dm_noflush_suspending flag has been cleared by now, so
3166 * unfortunately we must always run this.
3167 */
3168 noflush_work(tc, do_noflush_stop);
2927} 3169}
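thin_presuspend()/thin_postsuspend() are split because dm_noflush_suspending() is only meaningful while the suspend is in progress, so requeue mode has to be entered in presuspend and left unconditionally in postsuspend. A simplified model of that pairing (the real noflush_work() synchronises with the pool worker, which is not reproduced here):

#include <stdbool.h>
#include <stdio.h>

struct thin_c {
    bool requeue_mode;  /* while set, thin_bio_map() requeues bios */
};

static void do_noflush_start(struct thin_c *tc)
{
    tc->requeue_mode = true;
    printf("requeue mode on: in-flight bios will be requeued\n");
}

static void do_noflush_stop(struct thin_c *tc)
{
    tc->requeue_mode = false;
    printf("requeue mode off\n");
}

static void thin_presuspend(struct thin_c *tc, bool noflush_suspend)
{
    /* Only a noflush suspend wants bios bounced back with REQUEUE. */
    if (noflush_suspend)
        do_noflush_start(tc);
}

static void thin_postsuspend(struct thin_c *tc)
{
    /*
     * The noflush flag is gone by postsuspend time, so this runs
     * unconditionally; stopping when never started is a no-op.
     */
    do_noflush_stop(tc);
}

int main(void)
{
    struct thin_c tc = { false };

    thin_presuspend(&tc, true);
    thin_postsuspend(&tc);
    return 0;
}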
2928 3170
2929/* 3171/*
@@ -3008,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
3008 3250
3009static struct target_type thin_target = { 3251static struct target_type thin_target = {
3010 .name = "thin", 3252 .name = "thin",
3011 .version = {1, 9, 0}, 3253 .version = {1, 11, 0},
3012 .module = THIS_MODULE, 3254 .module = THIS_MODULE,
3013 .ctr = thin_ctr, 3255 .ctr = thin_ctr,
3014 .dtr = thin_dtr, 3256 .dtr = thin_dtr,
3015 .map = thin_map, 3257 .map = thin_map,
3016 .end_io = thin_endio, 3258 .end_io = thin_endio,
3259 .presuspend = thin_presuspend,
3017 .postsuspend = thin_postsuspend, 3260 .postsuspend = thin_postsuspend,
3018 .status = thin_status, 3261 .status = thin_status,
3019 .iterate_devices = thin_iterate_devices, 3262 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 4b7941db3aff..796007a5e0e1 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -73,15 +73,10 @@ struct dm_verity_io {
73 sector_t block; 73 sector_t block;
74 unsigned n_blocks; 74 unsigned n_blocks;
75 75
76 /* saved bio vector */ 76 struct bvec_iter iter;
77 struct bio_vec *io_vec;
78 unsigned io_vec_size;
79 77
80 struct work_struct work; 78 struct work_struct work;
81 79
82 /* A space for short vectors; longer vectors are allocated separately. */
83 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
84
85 /* 80 /*
86 * Three variably-size fields follow this struct: 81 * Three variably-size fields follow this struct:
87 * 82 *
@@ -284,9 +279,10 @@ release_ret_r:
284static int verity_verify_io(struct dm_verity_io *io) 279static int verity_verify_io(struct dm_verity_io *io)
285{ 280{
286 struct dm_verity *v = io->v; 281 struct dm_verity *v = io->v;
282 struct bio *bio = dm_bio_from_per_bio_data(io,
283 v->ti->per_bio_data_size);
287 unsigned b; 284 unsigned b;
288 int i; 285 int i;
289 unsigned vector = 0, offset = 0;
290 286
291 for (b = 0; b < io->n_blocks; b++) { 287 for (b = 0; b < io->n_blocks; b++) {
292 struct shash_desc *desc; 288 struct shash_desc *desc;
@@ -336,31 +332,22 @@ test_block_hash:
336 } 332 }
337 333
338 todo = 1 << v->data_dev_block_bits; 334 todo = 1 << v->data_dev_block_bits;
339 do { 335 while (io->iter.bi_size) {
340 struct bio_vec *bv;
341 u8 *page; 336 u8 *page;
342 unsigned len; 337 struct bio_vec bv = bio_iter_iovec(bio, io->iter);
343 338
344 BUG_ON(vector >= io->io_vec_size); 339 page = kmap_atomic(bv.bv_page);
345 bv = &io->io_vec[vector]; 340 r = crypto_shash_update(desc, page + bv.bv_offset,
346 page = kmap_atomic(bv->bv_page); 341 bv.bv_len);
347 len = bv->bv_len - offset;
348 if (likely(len >= todo))
349 len = todo;
350 r = crypto_shash_update(desc,
351 page + bv->bv_offset + offset, len);
352 kunmap_atomic(page); 342 kunmap_atomic(page);
343
353 if (r < 0) { 344 if (r < 0) {
354 DMERR("crypto_shash_update failed: %d", r); 345 DMERR("crypto_shash_update failed: %d", r);
355 return r; 346 return r;
356 } 347 }
357 offset += len; 348
358 if (likely(offset == bv->bv_len)) { 349 bio_advance_iter(bio, &io->iter, bv.bv_len);
359 offset = 0; 350 }
360 vector++;
361 }
362 todo -= len;
363 } while (todo);
364 351
365 if (!v->version) { 352 if (!v->version) {
366 r = crypto_shash_update(desc, v->salt, v->salt_size); 353 r = crypto_shash_update(desc, v->salt, v->salt_size);
@@ -383,8 +370,6 @@ test_block_hash:
383 return -EIO; 370 return -EIO;
384 } 371 }
385 } 372 }
386 BUG_ON(vector != io->io_vec_size);
387 BUG_ON(offset);
388 373
389 return 0; 374 return 0;
390} 375}
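With immutable biovecs, dm-verity stops copying the bio's vector and instead keeps a private bvec_iter that it walks with bio_iter_iovec()/bio_advance_iter(). The sketch below models that iterator over a plain array of segments and feeds each piece to a toy checksum where the kernel calls crypto_shash_update(); all of the types here are stand-ins, not the block-layer ones.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct segment {            /* stands in for struct bio_vec */
    const uint8_t *data;
    size_t len;
};

struct iter {               /* stands in for struct bvec_iter */
    size_t idx;             /* current segment */
    size_t done;            /* bytes consumed within that segment */
    size_t remaining;       /* bytes left in the whole "bio" */
};

/* Return the current contiguous piece without consuming it. */
static struct segment iter_iovec(const struct segment *segs, struct iter it)
{
    struct segment s = { segs[it.idx].data + it.done,
                         segs[it.idx].len - it.done };
    if (s.len > it.remaining)
        s.len = it.remaining;
    return s;
}

/* Advance by nbytes, moving on when a segment is exhausted (3 segments here). */
static void iter_advance(const struct segment *segs, struct iter *it, size_t nbytes)
{
    it->remaining -= nbytes;
    it->done += nbytes;
    while (it->idx < 3 && it->done == segs[it->idx].len) {
        it->idx++;
        it->done = 0;
    }
}

int main(void)
{
    const uint8_t a[] = "hello ", b[] = "immutable ", c[] = "biovecs";
    struct segment segs[3] = { { a, 6 }, { b, 10 }, { c, 7 } };
    struct iter it = { 0, 0, 23 };
    uint32_t sum = 0;

    while (it.remaining) {
        struct segment s = iter_iovec(segs, it);

        for (size_t i = 0; i < s.len; i++)  /* crypto_shash_update() stand-in */
            sum += s.data[i];
        iter_advance(segs, &it, s.len);
    }
    printf("checksum %u\n", sum);
    return 0;
}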
@@ -400,10 +385,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
400 bio->bi_end_io = io->orig_bi_end_io; 385 bio->bi_end_io = io->orig_bi_end_io;
401 bio->bi_private = io->orig_bi_private; 386 bio->bi_private = io->orig_bi_private;
402 387
403 if (io->io_vec != io->io_vec_inline) 388 bio_endio_nodec(bio, error);
404 mempool_free(io->io_vec, v->vec_mempool);
405
406 bio_endio(bio, error);
407} 389}
408 390
409static void verity_work(struct work_struct *w) 391static void verity_work(struct work_struct *w)
@@ -493,9 +475,9 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
493 struct dm_verity_io *io; 475 struct dm_verity_io *io;
494 476
495 bio->bi_bdev = v->data_dev->bdev; 477 bio->bi_bdev = v->data_dev->bdev;
496 bio->bi_sector = verity_map_sector(v, bio->bi_sector); 478 bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector);
497 479
498 if (((unsigned)bio->bi_sector | bio_sectors(bio)) & 480 if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
499 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { 481 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
500 DMERR_LIMIT("unaligned io"); 482 DMERR_LIMIT("unaligned io");
501 return -EIO; 483 return -EIO;
@@ -514,18 +496,12 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
514 io->v = v; 496 io->v = v;
515 io->orig_bi_end_io = bio->bi_end_io; 497 io->orig_bi_end_io = bio->bi_end_io;
516 io->orig_bi_private = bio->bi_private; 498 io->orig_bi_private = bio->bi_private;
517 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); 499 io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
518 io->n_blocks = bio->bi_size >> v->data_dev_block_bits; 500 io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits;
519 501
520 bio->bi_end_io = verity_end_io; 502 bio->bi_end_io = verity_end_io;
521 bio->bi_private = io; 503 bio->bi_private = io;
522 io->io_vec_size = bio_segments(bio); 504 io->iter = bio->bi_iter;
523 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
524 io->io_vec = io->io_vec_inline;
525 else
526 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
527 memcpy(io->io_vec, bio_iovec(bio),
528 io->io_vec_size * sizeof(struct bio_vec));
529 505
530 verity_submit_prefetch(v, io); 506 verity_submit_prefetch(v, io);
531 507
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0704c523a76b..8c53b09b9a2c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -200,8 +200,8 @@ struct mapped_device {
200 /* forced geometry settings */ 200 /* forced geometry settings */
201 struct hd_geometry geometry; 201 struct hd_geometry geometry;
202 202
203 /* sysfs handle */ 203 /* kobject and completion */
204 struct kobject kobj; 204 struct dm_kobject_holder kobj_holder;
205 205
206 /* zero-length flush that will be cloned and submitted to targets */ 206 /* zero-length flush that will be cloned and submitted to targets */
207 struct bio flush_bio; 207 struct bio flush_bio;
@@ -575,7 +575,7 @@ static void start_io_acct(struct dm_io *io)
575 atomic_inc_return(&md->pending[rw])); 575 atomic_inc_return(&md->pending[rw]));
576 576
577 if (unlikely(dm_stats_used(&md->stats))) 577 if (unlikely(dm_stats_used(&md->stats)))
578 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 578 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
579 bio_sectors(bio), false, 0, &io->stats_aux); 579 bio_sectors(bio), false, 0, &io->stats_aux);
580} 580}
581 581
@@ -593,7 +593,7 @@ static void end_io_acct(struct dm_io *io)
593 part_stat_unlock(); 593 part_stat_unlock();
594 594
595 if (unlikely(dm_stats_used(&md->stats))) 595 if (unlikely(dm_stats_used(&md->stats)))
596 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 596 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
597 bio_sectors(bio), true, duration, &io->stats_aux); 597 bio_sectors(bio), true, duration, &io->stats_aux);
598 598
599 /* 599 /*
@@ -742,7 +742,7 @@ static void dec_pending(struct dm_io *io, int error)
742 if (io_error == DM_ENDIO_REQUEUE) 742 if (io_error == DM_ENDIO_REQUEUE)
743 return; 743 return;
744 744
745 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 745 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
746 /* 746 /*
747 * Preflush done for flush with data, reissue 747 * Preflush done for flush with data, reissue
748 * without REQ_FLUSH. 748 * without REQ_FLUSH.
@@ -797,7 +797,7 @@ static void end_clone_bio(struct bio *clone, int error)
797 struct dm_rq_clone_bio_info *info = clone->bi_private; 797 struct dm_rq_clone_bio_info *info = clone->bi_private;
798 struct dm_rq_target_io *tio = info->tio; 798 struct dm_rq_target_io *tio = info->tio;
799 struct bio *bio = info->orig; 799 struct bio *bio = info->orig;
800 unsigned int nr_bytes = info->orig->bi_size; 800 unsigned int nr_bytes = info->orig->bi_iter.bi_size;
801 801
802 bio_put(clone); 802 bio_put(clone);
803 803
@@ -1128,7 +1128,7 @@ static void __map_bio(struct dm_target_io *tio)
1128 * this io. 1128 * this io.
1129 */ 1129 */
1130 atomic_inc(&tio->io->io_count); 1130 atomic_inc(&tio->io->io_count);
1131 sector = clone->bi_sector; 1131 sector = clone->bi_iter.bi_sector;
1132 r = ti->type->map(ti, clone); 1132 r = ti->type->map(ti, clone);
1133 if (r == DM_MAPIO_REMAPPED) { 1133 if (r == DM_MAPIO_REMAPPED) {
1134 /* the bio has been remapped so dispatch it */ 1134 /* the bio has been remapped so dispatch it */
@@ -1155,76 +1155,32 @@ struct clone_info {
1155 struct dm_io *io; 1155 struct dm_io *io;
1156 sector_t sector; 1156 sector_t sector;
1157 sector_t sector_count; 1157 sector_t sector_count;
1158 unsigned short idx;
1159}; 1158};
1160 1159
1161static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) 1160static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1162{ 1161{
1163 bio->bi_sector = sector; 1162 bio->bi_iter.bi_sector = sector;
1164 bio->bi_size = to_bytes(len); 1163 bio->bi_iter.bi_size = to_bytes(len);
1165}
1166
1167static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1168{
1169 bio->bi_idx = idx;
1170 bio->bi_vcnt = idx + bv_count;
1171 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1172}
1173
1174static void clone_bio_integrity(struct bio *bio, struct bio *clone,
1175 unsigned short idx, unsigned len, unsigned offset,
1176 unsigned trim)
1177{
1178 if (!bio_integrity(bio))
1179 return;
1180
1181 bio_integrity_clone(clone, bio, GFP_NOIO);
1182
1183 if (trim)
1184 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1185}
1186
1187/*
1188 * Creates a little bio that just does part of a bvec.
1189 */
1190static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1191 sector_t sector, unsigned short idx,
1192 unsigned offset, unsigned len)
1193{
1194 struct bio *clone = &tio->clone;
1195 struct bio_vec *bv = bio->bi_io_vec + idx;
1196
1197 *clone->bi_io_vec = *bv;
1198
1199 bio_setup_sector(clone, sector, len);
1200
1201 clone->bi_bdev = bio->bi_bdev;
1202 clone->bi_rw = bio->bi_rw;
1203 clone->bi_vcnt = 1;
1204 clone->bi_io_vec->bv_offset = offset;
1205 clone->bi_io_vec->bv_len = clone->bi_size;
1206 clone->bi_flags |= 1 << BIO_CLONED;
1207
1208 clone_bio_integrity(bio, clone, idx, len, offset, 1);
1209} 1164}
1210 1165
1211/* 1166/*
1212 * Creates a bio that consists of range of complete bvecs. 1167 * Creates a bio that consists of range of complete bvecs.
1213 */ 1168 */
1214static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1169static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1215 sector_t sector, unsigned short idx, 1170 sector_t sector, unsigned len)
1216 unsigned short bv_count, unsigned len)
1217{ 1171{
1218 struct bio *clone = &tio->clone; 1172 struct bio *clone = &tio->clone;
1219 unsigned trim = 0;
1220 1173
1221 __bio_clone(clone, bio); 1174 __bio_clone_fast(clone, bio);
1222 bio_setup_sector(clone, sector, len); 1175
1223 bio_setup_bv(clone, idx, bv_count); 1176 if (bio_integrity(bio))
1177 bio_integrity_clone(clone, bio, GFP_NOIO);
1224 1178
1225 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1179 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1226 trim = 1; 1180 clone->bi_iter.bi_size = to_bytes(len);
1227 clone_bio_integrity(bio, clone, idx, len, 0, trim); 1181
1182 if (bio_integrity(bio))
1183 bio_integrity_trim(clone, 0, len);
1228} 1184}
1229 1185
1230static struct dm_target_io *alloc_tio(struct clone_info *ci, 1186static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@ -1257,7 +1213,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci,
1257 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1213 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1258 * and discard, so no need for concern about wasted bvec allocations. 1214 * and discard, so no need for concern about wasted bvec allocations.
1259 */ 1215 */
1260 __bio_clone(clone, ci->bio); 1216 __bio_clone_fast(clone, ci->bio);
1261 if (len) 1217 if (len)
1262 bio_setup_sector(clone, ci->sector, len); 1218 bio_setup_sector(clone, ci->sector, len);
1263 1219
@@ -1286,10 +1242,7 @@ static int __send_empty_flush(struct clone_info *ci)
1286} 1242}
1287 1243
1288static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1244static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1289 sector_t sector, int nr_iovecs, 1245 sector_t sector, unsigned len)
1290 unsigned short idx, unsigned short bv_count,
1291 unsigned offset, unsigned len,
1292 unsigned split_bvec)
1293{ 1246{
1294 struct bio *bio = ci->bio; 1247 struct bio *bio = ci->bio;
1295 struct dm_target_io *tio; 1248 struct dm_target_io *tio;
@@ -1303,11 +1256,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
1303 num_target_bios = ti->num_write_bios(ti, bio); 1256 num_target_bios = ti->num_write_bios(ti, bio);
1304 1257
1305 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1258 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1306 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); 1259 tio = alloc_tio(ci, ti, 0, target_bio_nr);
1307 if (split_bvec) 1260 clone_bio(tio, bio, sector, len);
1308 clone_split_bio(tio, bio, sector, idx, offset, len);
1309 else
1310 clone_bio(tio, bio, sector, idx, bv_count, len);
1311 __map_bio(tio); 1261 __map_bio(tio);
1312 } 1262 }
1313} 1263}
@@ -1379,68 +1329,13 @@ static int __send_write_same(struct clone_info *ci)
1379} 1329}
1380 1330
1381/* 1331/*
1382 * Find maximum number of sectors / bvecs we can process with a single bio.
1383 */
1384static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1385{
1386 struct bio *bio = ci->bio;
1387 sector_t bv_len, total_len = 0;
1388
1389 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1390 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1391
1392 if (bv_len > max)
1393 break;
1394
1395 max -= bv_len;
1396 total_len += bv_len;
1397 }
1398
1399 return total_len;
1400}
1401
1402static int __split_bvec_across_targets(struct clone_info *ci,
1403 struct dm_target *ti, sector_t max)
1404{
1405 struct bio *bio = ci->bio;
1406 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1407 sector_t remaining = to_sector(bv->bv_len);
1408 unsigned offset = 0;
1409 sector_t len;
1410
1411 do {
1412 if (offset) {
1413 ti = dm_table_find_target(ci->map, ci->sector);
1414 if (!dm_target_is_valid(ti))
1415 return -EIO;
1416
1417 max = max_io_len(ci->sector, ti);
1418 }
1419
1420 len = min(remaining, max);
1421
1422 __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1423 bv->bv_offset + offset, len, 1);
1424
1425 ci->sector += len;
1426 ci->sector_count -= len;
1427 offset += to_bytes(len);
1428 } while (remaining -= len);
1429
1430 ci->idx++;
1431
1432 return 0;
1433}
1434
1435/*
1436 * Select the correct strategy for processing a non-flush bio. 1332 * Select the correct strategy for processing a non-flush bio.
1437 */ 1333 */
1438static int __split_and_process_non_flush(struct clone_info *ci) 1334static int __split_and_process_non_flush(struct clone_info *ci)
1439{ 1335{
1440 struct bio *bio = ci->bio; 1336 struct bio *bio = ci->bio;
1441 struct dm_target *ti; 1337 struct dm_target *ti;
1442 sector_t len, max; 1338 unsigned len;
1443 int idx;
1444 1339
1445 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1340 if (unlikely(bio->bi_rw & REQ_DISCARD))
1446 return __send_discard(ci); 1341 return __send_discard(ci);
@@ -1451,41 +1346,14 @@ static int __split_and_process_non_flush(struct clone_info *ci)
1451 if (!dm_target_is_valid(ti)) 1346 if (!dm_target_is_valid(ti))
1452 return -EIO; 1347 return -EIO;
1453 1348
1454 max = max_io_len(ci->sector, ti); 1349 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1455
1456 /*
1457 * Optimise for the simple case where we can do all of
1458 * the remaining io with a single clone.
1459 */
1460 if (ci->sector_count <= max) {
1461 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1462 ci->idx, bio->bi_vcnt - ci->idx, 0,
1463 ci->sector_count, 0);
1464 ci->sector_count = 0;
1465 return 0;
1466 }
1467 1350
1468 /* 1351 __clone_and_map_data_bio(ci, ti, ci->sector, len);
1469 * There are some bvecs that don't span targets.
1470 * Do as many of these as possible.
1471 */
1472 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1473 len = __len_within_target(ci, max, &idx);
1474
1475 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1476 ci->idx, idx - ci->idx, 0, len, 0);
1477 1352
1478 ci->sector += len; 1353 ci->sector += len;
1479 ci->sector_count -= len; 1354 ci->sector_count -= len;
1480 ci->idx = idx;
1481 1355
1482 return 0; 1356 return 0;
1483 }
1484
1485 /*
1486 * Handle a bvec that must be split between two or more targets.
1487 */
1488 return __split_bvec_across_targets(ci, ti, max);
1489} 1357}
1490 1358
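Because the block layer can now split bios itself, __split_and_process_non_flush() reduces to "clone min(max_io_len, remaining) and advance". A standalone model of that loop over a few invented target boundaries:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Hypothetical target boundaries: [0,100), [100,250), [250,1000). */
static const sector_t bounds[] = { 0, 100, 250, 1000 };

/* Sectors from 'sector' to the end of the target containing it. */
static sector_t max_io_len(sector_t sector)
{
    for (int i = 1; i < 4; i++)
        if (sector < bounds[i])
            return bounds[i] - sector;
    return 0;
}

int main(void)
{
    sector_t sector = 90, count = 200;  /* a "bio" spanning two boundaries */

    while (count) {
        sector_t len = max_io_len(sector);

        if (!len)
            break;          /* past the table; cannot happen in this example */
        if (len > count)
            len = count;    /* the min(), as in the new code */
        printf("clone: sector %llu len %llu\n",
               (unsigned long long)sector, (unsigned long long)len);
        sector += len;
        count -= len;
    }
    return 0;
}

The three clones it prints (90+10, 100+150, 250+40) correspond to what used to need the bvec-walking helpers deleted above.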
1491/* 1359/*
@@ -1510,8 +1378,7 @@ static void __split_and_process_bio(struct mapped_device *md,
1510 ci.io->bio = bio; 1378 ci.io->bio = bio;
1511 ci.io->md = md; 1379 ci.io->md = md;
1512 spin_lock_init(&ci.io->endio_lock); 1380 spin_lock_init(&ci.io->endio_lock);
1513 ci.sector = bio->bi_sector; 1381 ci.sector = bio->bi_iter.bi_sector;
1514 ci.idx = bio->bi_idx;
1515 1382
1516 start_io_acct(ci.io); 1383 start_io_acct(ci.io);
1517 1384
@@ -2041,6 +1908,7 @@ static struct mapped_device *alloc_dev(int minor)
2041 init_waitqueue_head(&md->wait); 1908 init_waitqueue_head(&md->wait);
2042 INIT_WORK(&md->work, dm_wq_work); 1909 INIT_WORK(&md->work, dm_wq_work);
2043 init_waitqueue_head(&md->eventq); 1910 init_waitqueue_head(&md->eventq);
1911 init_completion(&md->kobj_holder.completion);
2044 1912
2045 md->disk->major = _major; 1913 md->disk->major = _major;
2046 md->disk->first_minor = minor; 1914 md->disk->first_minor = minor;
@@ -2902,20 +2770,14 @@ struct gendisk *dm_disk(struct mapped_device *md)
2902 2770
2903struct kobject *dm_kobject(struct mapped_device *md) 2771struct kobject *dm_kobject(struct mapped_device *md)
2904{ 2772{
2905 return &md->kobj; 2773 return &md->kobj_holder.kobj;
2906} 2774}
2907 2775
2908/*
2909 * struct mapped_device should not be exported outside of dm.c
2910 * so use this check to verify that kobj is part of md structure
2911 */
2912struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2776struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2913{ 2777{
2914 struct mapped_device *md; 2778 struct mapped_device *md;
2915 2779
2916 md = container_of(kobj, struct mapped_device, kobj); 2780 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2917 if (&md->kobj != kobj)
2918 return NULL;
2919 2781
2920 if (test_bit(DMF_FREEING, &md->flags) || 2782 if (test_bit(DMF_FREEING, &md->flags) ||
2921 dm_deleting_md(md)) 2783 dm_deleting_md(md))
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index c57ba550f69e..c4569f02f50f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -15,6 +15,8 @@
15#include <linux/list.h> 15#include <linux/list.h>
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18#include <linux/completion.h>
19#include <linux/kobject.h>
18 20
19#include "dm-stats.h" 21#include "dm-stats.h"
20 22
@@ -148,12 +150,27 @@ void dm_interface_exit(void);
148/* 150/*
149 * sysfs interface 151 * sysfs interface
150 */ 152 */
153struct dm_kobject_holder {
154 struct kobject kobj;
155 struct completion completion;
156};
157
158static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
159{
160 return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
161}
162
151int dm_sysfs_init(struct mapped_device *md); 163int dm_sysfs_init(struct mapped_device *md);
152void dm_sysfs_exit(struct mapped_device *md); 164void dm_sysfs_exit(struct mapped_device *md);
153struct kobject *dm_kobject(struct mapped_device *md); 165struct kobject *dm_kobject(struct mapped_device *md);
154struct mapped_device *dm_get_from_kobject(struct kobject *kobj); 166struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
155 167
156/* 168/*
169 * The kobject helper
170 */
171void dm_kobject_release(struct kobject *kobj);
172
173/*
157 * Targets for linear and striped mappings 174 * Targets for linear and striped mappings
158 */ 175 */
159int dm_linear_init(void); 176int dm_linear_init(void);
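The dm_kobject_holder pattern embeds the kobject next to a completion so dm_get_from_kobject()/dm_get_completion_from_kobject() can recover the owning structure with container_of() instead of the old pointer-comparison check. A freestanding illustration of embed-and-recover (plain structs, no sysfs):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct kobject { const char *name; };
struct completion { int done; };

struct dm_kobject_holder {
    struct kobject kobj;        /* embedded, not a pointer */
    struct completion completion;
};

static struct completion *get_completion_from_kobject(struct kobject *kobj)
{
    return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
}

int main(void)
{
    struct dm_kobject_holder holder = { { "dm-device" }, { 0 } };
    struct kobject *kobj = &holder.kobj;    /* what sysfs code would hand back */

    get_completion_from_kobject(kobj)->done = 1;
    printf("%s: completion done=%d\n", kobj->name, holder.completion.done);
    return 0;
}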
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 3193aefe982b..e8b4574956c7 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -74,8 +74,8 @@ static void faulty_fail(struct bio *bio, int error)
74{ 74{
75 struct bio *b = bio->bi_private; 75 struct bio *b = bio->bi_private;
76 76
77 b->bi_size = bio->bi_size; 77 b->bi_iter.bi_size = bio->bi_iter.bi_size;
78 b->bi_sector = bio->bi_sector; 78 b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
79 79
80 bio_put(bio); 80 bio_put(bio);
81 81
@@ -185,26 +185,31 @@ static void make_request(struct mddev *mddev, struct bio *bio)
185 return; 185 return;
186 } 186 }
187 187
188 if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) 188 if (check_sector(conf, bio->bi_iter.bi_sector,
189 bio_end_sector(bio), WRITE))
189 failit = 1; 190 failit = 1;
190 if (check_mode(conf, WritePersistent)) { 191 if (check_mode(conf, WritePersistent)) {
191 add_sector(conf, bio->bi_sector, WritePersistent); 192 add_sector(conf, bio->bi_iter.bi_sector,
193 WritePersistent);
192 failit = 1; 194 failit = 1;
193 } 195 }
194 if (check_mode(conf, WriteTransient)) 196 if (check_mode(conf, WriteTransient))
195 failit = 1; 197 failit = 1;
196 } else { 198 } else {
197 /* read request */ 199 /* read request */
198 if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) 200 if (check_sector(conf, bio->bi_iter.bi_sector,
201 bio_end_sector(bio), READ))
199 failit = 1; 202 failit = 1;
200 if (check_mode(conf, ReadTransient)) 203 if (check_mode(conf, ReadTransient))
201 failit = 1; 204 failit = 1;
202 if (check_mode(conf, ReadPersistent)) { 205 if (check_mode(conf, ReadPersistent)) {
203 add_sector(conf, bio->bi_sector, ReadPersistent); 206 add_sector(conf, bio->bi_iter.bi_sector,
207 ReadPersistent);
204 failit = 1; 208 failit = 1;
205 } 209 }
206 if (check_mode(conf, ReadFixable)) { 210 if (check_mode(conf, ReadFixable)) {
207 add_sector(conf, bio->bi_sector, ReadFixable); 211 add_sector(conf, bio->bi_iter.bi_sector,
212 ReadFixable);
208 failit = 1; 213 failit = 1;
209 } 214 }
210 } 215 }
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index f03fabd2b37b..56f534b4a2d2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,65 +288,65 @@ static int linear_stop (struct mddev *mddev)
288 288
289static void linear_make_request(struct mddev *mddev, struct bio *bio) 289static void linear_make_request(struct mddev *mddev, struct bio *bio)
290{ 290{
291 char b[BDEVNAME_SIZE];
291 struct dev_info *tmp_dev; 292 struct dev_info *tmp_dev;
292 sector_t start_sector; 293 struct bio *split;
294 sector_t start_sector, end_sector, data_offset;
293 295
294 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 296 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
295 md_flush_request(mddev, bio); 297 md_flush_request(mddev, bio);
296 return; 298 return;
297 } 299 }
298 300
299 rcu_read_lock(); 301 do {
300 tmp_dev = which_dev(mddev, bio->bi_sector); 302 rcu_read_lock();
301 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
302
303
304 if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
305 || (bio->bi_sector < start_sector))) {
306 char b[BDEVNAME_SIZE];
307
308 printk(KERN_ERR
309 "md/linear:%s: make_request: Sector %llu out of bounds on "
310 "dev %s: %llu sectors, offset %llu\n",
311 mdname(mddev),
312 (unsigned long long)bio->bi_sector,
313 bdevname(tmp_dev->rdev->bdev, b),
314 (unsigned long long)tmp_dev->rdev->sectors,
315 (unsigned long long)start_sector);
316 rcu_read_unlock();
317 bio_io_error(bio);
318 return;
319 }
320 if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
321 /* This bio crosses a device boundary, so we have to
322 * split it.
323 */
324 struct bio_pair *bp;
325 sector_t end_sector = tmp_dev->end_sector;
326 303
327 rcu_read_unlock(); 304 tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
328 305 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
329 bp = bio_split(bio, end_sector - bio->bi_sector); 306 end_sector = tmp_dev->end_sector;
307 data_offset = tmp_dev->rdev->data_offset;
308 bio->bi_bdev = tmp_dev->rdev->bdev;
330 309
331 linear_make_request(mddev, &bp->bio1); 310 rcu_read_unlock();
332 linear_make_request(mddev, &bp->bio2);
333 bio_pair_release(bp);
334 return;
335 }
336
337 bio->bi_bdev = tmp_dev->rdev->bdev;
338 bio->bi_sector = bio->bi_sector - start_sector
339 + tmp_dev->rdev->data_offset;
340 rcu_read_unlock();
341 311
342 if (unlikely((bio->bi_rw & REQ_DISCARD) && 312 if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
343 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { 313 bio->bi_iter.bi_sector < start_sector))
344 /* Just ignore it */ 314 goto out_of_bounds;
345 bio_endio(bio, 0); 315
346 return; 316 if (unlikely(bio_end_sector(bio) > end_sector)) {
347 } 317 /* This bio crosses a device boundary, so we have to
318 * split it.
319 */
320 split = bio_split(bio, end_sector -
321 bio->bi_iter.bi_sector,
322 GFP_NOIO, fs_bio_set);
323 bio_chain(split, bio);
324 } else {
325 split = bio;
326 }
348 327
349 generic_make_request(bio); 328 split->bi_iter.bi_sector = split->bi_iter.bi_sector -
329 start_sector + data_offset;
330
331 if (unlikely((split->bi_rw & REQ_DISCARD) &&
332 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
333 /* Just ignore it */
334 bio_endio(split, 0);
335 } else
336 generic_make_request(split);
337 } while (split != bio);
338 return;
339
340out_of_bounds:
341 printk(KERN_ERR
342 "md/linear:%s: make_request: Sector %llu out of bounds on "
343 "dev %s: %llu sectors, offset %llu\n",
344 mdname(mddev),
345 (unsigned long long)bio->bi_iter.bi_sector,
346 bdevname(tmp_dev->rdev->bdev, b),
347 (unsigned long long)tmp_dev->rdev->sectors,
348 (unsigned long long)start_sector);
349 bio_io_error(bio);
350} 350}
351 351
352static void linear_status (struct seq_file *seq, struct mddev *mddev) 352static void linear_status (struct seq_file *seq, struct mddev *mddev)
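linear_make_request() now loops, splitting with bio_split() and tying each piece to the original with bio_chain() whenever the bio crosses a component-device boundary. The sketch below models only the chain bookkeeping, i.e. that the parent cannot complete until every chained split has; the structs are simplified stand-ins for the real bio fields.

#include <stddef.h>
#include <stdio.h>

struct bio {
    int remaining;          /* stands in for __bi_remaining */
    struct bio *parent;     /* set by bio_chain() */
    const char *name;
};

static void bio_endio(struct bio *bio)
{
    if (--bio->remaining)
        return;
    printf("%s: completed\n", bio->name);
    if (bio->parent)
        bio_endio(bio->parent); /* a chained completion propagates upward */
}

/* bio_chain(): the parent cannot complete until the child has. */
static void bio_chain(struct bio *child, struct bio *parent)
{
    child->parent = parent;
    parent->remaining++;
}

int main(void)
{
    struct bio parent = { 1, NULL, "original bio" };
    struct bio split  = { 1, NULL, "split" };

    bio_chain(&split, &parent); /* the bio crossed a device boundary */

    bio_endio(&parent);         /* device finished the trailing part first */
    printf("parent still pending: %d reference(s) left\n", parent.remaining);
    bio_endio(&split);          /* split finishes; now the parent completes */
    return 0;
}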
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 21f4d7ff0da2..4ad5cc4e63e8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -393,7 +393,7 @@ static void md_submit_flush_data(struct work_struct *ws)
393 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 393 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
394 struct bio *bio = mddev->flush_bio; 394 struct bio *bio = mddev->flush_bio;
395 395
396 if (bio->bi_size == 0) 396 if (bio->bi_iter.bi_size == 0)
397 /* an empty barrier - all done */ 397 /* an empty barrier - all done */
398 bio_endio(bio, 0); 398 bio_endio(bio, 0);
399 else { 399 else {
@@ -754,7 +754,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
754 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 754 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
755 755
756 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 756 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
757 bio->bi_sector = sector; 757 bio->bi_iter.bi_sector = sector;
758 bio_add_page(bio, page, size, 0); 758 bio_add_page(bio, page, size, 0);
759 bio->bi_private = rdev; 759 bio->bi_private = rdev;
760 bio->bi_end_io = super_written; 760 bio->bi_end_io = super_written;
@@ -782,18 +782,16 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
782 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 782 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
783 int ret; 783 int ret;
784 784
785 rw |= REQ_SYNC;
786
787 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 785 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
788 rdev->meta_bdev : rdev->bdev; 786 rdev->meta_bdev : rdev->bdev;
789 if (metadata_op) 787 if (metadata_op)
790 bio->bi_sector = sector + rdev->sb_start; 788 bio->bi_iter.bi_sector = sector + rdev->sb_start;
791 else if (rdev->mddev->reshape_position != MaxSector && 789 else if (rdev->mddev->reshape_position != MaxSector &&
792 (rdev->mddev->reshape_backwards == 790 (rdev->mddev->reshape_backwards ==
793 (sector >= rdev->mddev->reshape_position))) 791 (sector >= rdev->mddev->reshape_position)))
794 bio->bi_sector = sector + rdev->new_data_offset; 792 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
795 else 793 else
796 bio->bi_sector = sector + rdev->data_offset; 794 bio->bi_iter.bi_sector = sector + rdev->data_offset;
797 bio_add_page(bio, page, size, 0); 795 bio_add_page(bio, page, size, 0);
798 submit_bio_wait(rw, bio); 796 submit_bio_wait(rw, bio);
799 797
@@ -1077,6 +1075,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1077 rdev->raid_disk = -1; 1075 rdev->raid_disk = -1;
1078 clear_bit(Faulty, &rdev->flags); 1076 clear_bit(Faulty, &rdev->flags);
1079 clear_bit(In_sync, &rdev->flags); 1077 clear_bit(In_sync, &rdev->flags);
1078 clear_bit(Bitmap_sync, &rdev->flags);
1080 clear_bit(WriteMostly, &rdev->flags); 1079 clear_bit(WriteMostly, &rdev->flags);
1081 1080
1082 if (mddev->raid_disks == 0) { 1081 if (mddev->raid_disks == 0) {
@@ -1155,6 +1154,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1155 */ 1154 */
1156 if (ev1 < mddev->bitmap->events_cleared) 1155 if (ev1 < mddev->bitmap->events_cleared)
1157 return 0; 1156 return 0;
1157 if (ev1 < mddev->events)
1158 set_bit(Bitmap_sync, &rdev->flags);
1158 } else { 1159 } else {
1159 if (ev1 < mddev->events) 1160 if (ev1 < mddev->events)
1160 /* just a hot-add of a new device, leave raid_disk at -1 */ 1161 /* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1170,6 +1171,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1170 desc->raid_disk < mddev->raid_disks */) { 1171 desc->raid_disk < mddev->raid_disks */) {
1171 set_bit(In_sync, &rdev->flags); 1172 set_bit(In_sync, &rdev->flags);
1172 rdev->raid_disk = desc->raid_disk; 1173 rdev->raid_disk = desc->raid_disk;
1174 rdev->saved_raid_disk = desc->raid_disk;
1173 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1175 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1174 /* active but not in sync implies recovery up to 1176 /* active but not in sync implies recovery up to
1175 * reshape position. We don't know exactly where 1177 * reshape position. We don't know exactly where
@@ -1563,6 +1565,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1563 rdev->raid_disk = -1; 1565 rdev->raid_disk = -1;
1564 clear_bit(Faulty, &rdev->flags); 1566 clear_bit(Faulty, &rdev->flags);
1565 clear_bit(In_sync, &rdev->flags); 1567 clear_bit(In_sync, &rdev->flags);
1568 clear_bit(Bitmap_sync, &rdev->flags);
1566 clear_bit(WriteMostly, &rdev->flags); 1569 clear_bit(WriteMostly, &rdev->flags);
1567 1570
1568 if (mddev->raid_disks == 0) { 1571 if (mddev->raid_disks == 0) {
@@ -1645,6 +1648,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1645 */ 1648 */
1646 if (ev1 < mddev->bitmap->events_cleared) 1649 if (ev1 < mddev->bitmap->events_cleared)
1647 return 0; 1650 return 0;
1651 if (ev1 < mddev->events)
1652 set_bit(Bitmap_sync, &rdev->flags);
1648 } else { 1653 } else {
1649 if (ev1 < mddev->events) 1654 if (ev1 < mddev->events)
1650 /* just a hot-add of a new device, leave raid_disk at -1 */ 1655 /* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1665,10 +1670,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1665 set_bit(Faulty, &rdev->flags); 1670 set_bit(Faulty, &rdev->flags);
1666 break; 1671 break;
1667 default: 1672 default:
1673 rdev->saved_raid_disk = role;
1668 if ((le32_to_cpu(sb->feature_map) & 1674 if ((le32_to_cpu(sb->feature_map) &
1669 MD_FEATURE_RECOVERY_OFFSET)) 1675 MD_FEATURE_RECOVERY_OFFSET)) {
1670 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1676 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1671 else 1677 if (!(le32_to_cpu(sb->feature_map) &
1678 MD_FEATURE_RECOVERY_BITMAP))
1679 rdev->saved_raid_disk = -1;
1680 } else
1672 set_bit(In_sync, &rdev->flags); 1681 set_bit(In_sync, &rdev->flags);
1673 rdev->raid_disk = role; 1682 rdev->raid_disk = role;
1674 break; 1683 break;
@@ -1730,6 +1739,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1730 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1739 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1731 sb->recovery_offset = 1740 sb->recovery_offset =
1732 cpu_to_le64(rdev->recovery_offset); 1741 cpu_to_le64(rdev->recovery_offset);
1742 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1743 sb->feature_map |=
1744 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1733 } 1745 }
1734 if (test_bit(Replacement, &rdev->flags)) 1746 if (test_bit(Replacement, &rdev->flags))
1735 sb->feature_map |= 1747 sb->feature_map |=
@@ -2471,8 +2483,7 @@ repeat:
2471 if (rdev->sb_loaded != 1) 2483 if (rdev->sb_loaded != 1)
2472 continue; /* no noise on spare devices */ 2484 continue; /* no noise on spare devices */
2473 2485
2474 if (!test_bit(Faulty, &rdev->flags) && 2486 if (!test_bit(Faulty, &rdev->flags)) {
2475 rdev->saved_raid_disk == -1) {
2476 md_super_write(mddev,rdev, 2487 md_super_write(mddev,rdev,
2477 rdev->sb_start, rdev->sb_size, 2488 rdev->sb_start, rdev->sb_size,
2478 rdev->sb_page); 2489 rdev->sb_page);
@@ -2488,11 +2499,9 @@ repeat:
2488 rdev->badblocks.size = 0; 2499 rdev->badblocks.size = 0;
2489 } 2500 }
2490 2501
2491 } else if (test_bit(Faulty, &rdev->flags)) 2502 } else
2492 pr_debug("md: %s (skipping faulty)\n", 2503 pr_debug("md: %s (skipping faulty)\n",
2493 bdevname(rdev->bdev, b)); 2504 bdevname(rdev->bdev, b));
2494 else
2495 pr_debug("(skipping incremental s/r ");
2496 2505
2497 if (mddev->level == LEVEL_MULTIPATH) 2506 if (mddev->level == LEVEL_MULTIPATH)
2498 /* only need to write one superblock... */ 2507 /* only need to write one superblock... */
@@ -2608,6 +2617,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2608 * blocked - sets the Blocked flags 2617 * blocked - sets the Blocked flags
2609 * -blocked - clears the Blocked and possibly simulates an error 2618 * -blocked - clears the Blocked and possibly simulates an error
2610 * insync - sets Insync providing device isn't active 2619 * insync - sets Insync providing device isn't active
2620 * -insync - clear Insync for a device with a slot assigned,
2621 * so that it gets rebuilt based on bitmap
2611 * write_error - sets WriteErrorSeen 2622 * write_error - sets WriteErrorSeen
2612 * -write_error - clears WriteErrorSeen 2623 * -write_error - clears WriteErrorSeen
2613 */ 2624 */
@@ -2656,6 +2667,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2656 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2667 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2657 set_bit(In_sync, &rdev->flags); 2668 set_bit(In_sync, &rdev->flags);
2658 err = 0; 2669 err = 0;
2670 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2671 clear_bit(In_sync, &rdev->flags);
2672 rdev->saved_raid_disk = rdev->raid_disk;
2673 rdev->raid_disk = -1;
2674 err = 0;
2659 } else if (cmd_match(buf, "write_error")) { 2675 } else if (cmd_match(buf, "write_error")) {
2660 set_bit(WriteErrorSeen, &rdev->flags); 2676 set_bit(WriteErrorSeen, &rdev->flags);
2661 err = 0; 2677 err = 0;
@@ -2788,6 +2804,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2788 else 2804 else
2789 rdev->saved_raid_disk = -1; 2805 rdev->saved_raid_disk = -1;
2790 clear_bit(In_sync, &rdev->flags); 2806 clear_bit(In_sync, &rdev->flags);
2807 clear_bit(Bitmap_sync, &rdev->flags);
2791 err = rdev->mddev->pers-> 2808 err = rdev->mddev->pers->
2792 hot_add_disk(rdev->mddev, rdev); 2809 hot_add_disk(rdev->mddev, rdev);
2793 if (err) { 2810 if (err) {
@@ -3582,6 +3599,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3582 pers->run(mddev); 3599 pers->run(mddev);
3583 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3600 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3584 mddev_resume(mddev); 3601 mddev_resume(mddev);
3602 if (!mddev->thread)
3603 md_update_sb(mddev, 1);
3585 sysfs_notify(&mddev->kobj, NULL, "level"); 3604 sysfs_notify(&mddev->kobj, NULL, "level");
3586 md_new_event(mddev); 3605 md_new_event(mddev);
3587 return rv; 3606 return rv;
@@ -5760,8 +5779,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5760 info->raid_disk < mddev->raid_disks) { 5779 info->raid_disk < mddev->raid_disks) {
5761 rdev->raid_disk = info->raid_disk; 5780 rdev->raid_disk = info->raid_disk;
5762 set_bit(In_sync, &rdev->flags); 5781 set_bit(In_sync, &rdev->flags);
5782 clear_bit(Bitmap_sync, &rdev->flags);
5763 } else 5783 } else
5764 rdev->raid_disk = -1; 5784 rdev->raid_disk = -1;
5785 rdev->saved_raid_disk = rdev->raid_disk;
5765 } else 5786 } else
5766 super_types[mddev->major_version]. 5787 super_types[mddev->major_version].
5767 validate_super(mddev, rdev); 5788 validate_super(mddev, rdev);
@@ -5774,11 +5795,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5774 return -EINVAL; 5795 return -EINVAL;
5775 } 5796 }
5776 5797
5777 if (test_bit(In_sync, &rdev->flags))
5778 rdev->saved_raid_disk = rdev->raid_disk;
5779 else
5780 rdev->saved_raid_disk = -1;
5781
5782 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5798 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5783 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5799 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5784 set_bit(WriteMostly, &rdev->flags); 5800 set_bit(WriteMostly, &rdev->flags);
@@ -6328,6 +6344,32 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6328 return 0; 6344 return 0;
6329} 6345}
6330 6346
6347static inline bool md_ioctl_valid(unsigned int cmd)
6348{
6349 switch (cmd) {
6350 case ADD_NEW_DISK:
6351 case BLKROSET:
6352 case GET_ARRAY_INFO:
6353 case GET_BITMAP_FILE:
6354 case GET_DISK_INFO:
6355 case HOT_ADD_DISK:
6356 case HOT_REMOVE_DISK:
6357 case PRINT_RAID_DEBUG:
6358 case RAID_AUTORUN:
6359 case RAID_VERSION:
6360 case RESTART_ARRAY_RW:
6361 case RUN_ARRAY:
6362 case SET_ARRAY_INFO:
6363 case SET_BITMAP_FILE:
6364 case SET_DISK_FAULTY:
6365 case STOP_ARRAY:
6366 case STOP_ARRAY_RO:
6367 return true;
6368 default:
6369 return false;
6370 }
6371}
6372
6331static int md_ioctl(struct block_device *bdev, fmode_t mode, 6373static int md_ioctl(struct block_device *bdev, fmode_t mode,
6332 unsigned int cmd, unsigned long arg) 6374 unsigned int cmd, unsigned long arg)
6333{ 6375{
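md_ioctl_valid() front-loads a whitelist so unknown commands are rejected with -ENOTTY before any mddev lookup or locking. The same shape in a trivial userspace form, with invented command numbers:

#include <stdbool.h>
#include <stdio.h>
#include <errno.h>

/* Invented command numbers, for illustration only. */
#define DEMO_GET_INFO  0x900
#define DEMO_SET_INFO  0x901
#define DEMO_STOP      0x902

static bool demo_ioctl_valid(unsigned int cmd)
{
    switch (cmd) {
    case DEMO_GET_INFO:
    case DEMO_SET_INFO:
    case DEMO_STOP:
        return true;
    default:
        return false;
    }
}

static int demo_ioctl(unsigned int cmd)
{
    if (!demo_ioctl_valid(cmd))
        return -ENOTTY;     /* reject before taking any locks */

    /* ... per-command handling would follow ... */
    return 0;
}

int main(void)
{
    printf("known:   %d\n", demo_ioctl(DEMO_STOP));  /* 0 */
    printf("unknown: %d\n", demo_ioctl(0xdead));     /* -ENOTTY */
    return 0;
}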
@@ -6336,6 +6378,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6336 struct mddev *mddev = NULL; 6378 struct mddev *mddev = NULL;
6337 int ro; 6379 int ro;
6338 6380
6381 if (!md_ioctl_valid(cmd))
6382 return -ENOTTY;
6383
6339 switch (cmd) { 6384 switch (cmd) {
6340 case RAID_VERSION: 6385 case RAID_VERSION:
6341 case GET_ARRAY_INFO: 6386 case GET_ARRAY_INFO:
@@ -7706,10 +7751,12 @@ static int remove_and_add_spares(struct mddev *mddev,
7706 if (test_bit(Faulty, &rdev->flags)) 7751 if (test_bit(Faulty, &rdev->flags))
7707 continue; 7752 continue;
7708 if (mddev->ro && 7753 if (mddev->ro &&
7709 rdev->saved_raid_disk < 0) 7754 ! (rdev->saved_raid_disk >= 0 &&
7755 !test_bit(Bitmap_sync, &rdev->flags)))
7710 continue; 7756 continue;
7711 7757
7712 rdev->recovery_offset = 0; 7758 if (rdev->saved_raid_disk < 0)
7759 rdev->recovery_offset = 0;
7713 if (mddev->pers-> 7760 if (mddev->pers->
7714 hot_add_disk(mddev, rdev) == 0) { 7761 hot_add_disk(mddev, rdev) == 0) {
7715 if (sysfs_link_rdev(mddev, rdev)) 7762 if (sysfs_link_rdev(mddev, rdev))
@@ -7787,9 +7834,12 @@ void md_check_recovery(struct mddev *mddev)
7787 * As we only add devices that are already in-sync, 7834 * As we only add devices that are already in-sync,
7788 * we can activate the spares immediately. 7835 * we can activate the spares immediately.
7789 */ 7836 */
7790 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7791 remove_and_add_spares(mddev, NULL); 7837 remove_and_add_spares(mddev, NULL);
7792 mddev->pers->spare_active(mddev); 7838 /* There is no thread, but we need to call
7839 * ->spare_active and clear saved_raid_disk
7840 */
7841 md_reap_sync_thread(mddev);
7842 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7793 goto unlock; 7843 goto unlock;
7794 } 7844 }
7795 7845
@@ -7926,14 +7976,10 @@ void md_reap_sync_thread(struct mddev *mddev)
7926 mddev->pers->finish_reshape(mddev); 7976 mddev->pers->finish_reshape(mddev);
7927 7977
7928 /* If array is no-longer degraded, then any saved_raid_disk 7978 /* If array is no-longer degraded, then any saved_raid_disk
7929 * information must be scrapped. Also if any device is now 7979 * information must be scrapped.
7930 * In_sync we must scrape the saved_raid_disk for that device
7931 * do the superblock for an incrementally recovered device
7932 * written out.
7933 */ 7980 */
7934 rdev_for_each(rdev, mddev) 7981 if (!mddev->degraded)
7935 if (!mddev->degraded || 7982 rdev_for_each(rdev, mddev)
7936 test_bit(In_sync, &rdev->flags))
7937 rdev->saved_raid_disk = -1; 7983 rdev->saved_raid_disk = -1;
7938 7984
7939 md_update_sb(mddev, 1); 7985 md_update_sb(mddev, 1);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2f5cc8a7ef3e..07bba96de260 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -106,7 +106,7 @@ struct md_rdev {
106 */ 106 */
107 struct work_struct del_work; /* used for delayed sysfs removal */ 107 struct work_struct del_work; /* used for delayed sysfs removal */
108 108
109 struct sysfs_dirent *sysfs_state; /* handle for 'state' 109 struct kernfs_node *sysfs_state; /* handle for 'state'
110 * sysfs entry */ 110 * sysfs entry */
111 111
112 struct badblocks { 112 struct badblocks {
@@ -129,6 +129,9 @@ struct md_rdev {
129enum flag_bits { 129enum flag_bits {
130 Faulty, /* device is known to have a fault */ 130 Faulty, /* device is known to have a fault */
131 In_sync, /* device is in_sync with rest of array */ 131 In_sync, /* device is in_sync with rest of array */
132 Bitmap_sync, /* ..actually, not quite In_sync. Need a
133 * bitmap-based recovery to get fully in sync
134 */
132 Unmerged, /* device is being added to array and should 135 Unmerged, /* device is being added to array and should
133 * be considerred for bvec_merge_fn but not 136 * be considerred for bvec_merge_fn but not
134 * yet for actual IO 137 * yet for actual IO
@@ -376,10 +379,10 @@ struct mddev {
376 sector_t resync_max; /* resync should pause 379 sector_t resync_max; /* resync should pause
377 * when it gets here */ 380 * when it gets here */
378 381
379 struct sysfs_dirent *sysfs_state; /* handle for 'array_state' 382 struct kernfs_node *sysfs_state; /* handle for 'array_state'
380 * file in sysfs. 383 * file in sysfs.
381 */ 384 */
382 struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ 385 struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
383 386
384 struct work_struct del_work; /* used for delayed sysfs removal */ 387 struct work_struct del_work; /* used for delayed sysfs removal */
385 388
@@ -498,13 +501,13 @@ struct md_sysfs_entry {
498}; 501};
499extern struct attribute_group md_bitmap_group; 502extern struct attribute_group md_bitmap_group;
500 503
501static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) 504static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
502{ 505{
503 if (sd) 506 if (sd)
504 return sysfs_get_dirent(sd, name); 507 return sysfs_get_dirent(sd, name);
505 return sd; 508 return sd;
506} 509}
507static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) 510static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
508{ 511{
509 if (sd) 512 if (sd)
510 sysfs_notify_dirent(sd); 513 sysfs_notify_dirent(sd);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1642eae75a33..849ad39f547b 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -100,7 +100,7 @@ static void multipath_end_request(struct bio *bio, int error)
100 md_error (mp_bh->mddev, rdev); 100 md_error (mp_bh->mddev, rdev);
101 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 101 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
102 bdevname(rdev->bdev,b), 102 bdevname(rdev->bdev,b),
103 (unsigned long long)bio->bi_sector); 103 (unsigned long long)bio->bi_iter.bi_sector);
104 multipath_reschedule_retry(mp_bh); 104 multipath_reschedule_retry(mp_bh);
105 } else 105 } else
106 multipath_end_bh_io(mp_bh, error); 106 multipath_end_bh_io(mp_bh, error);
@@ -132,7 +132,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
132 multipath = conf->multipaths + mp_bh->path; 132 multipath = conf->multipaths + mp_bh->path;
133 133
134 mp_bh->bio = *bio; 134 mp_bh->bio = *bio;
135 mp_bh->bio.bi_sector += multipath->rdev->data_offset; 135 mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
136 mp_bh->bio.bi_bdev = multipath->rdev->bdev; 136 mp_bh->bio.bi_bdev = multipath->rdev->bdev;
137 mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; 137 mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
138 mp_bh->bio.bi_end_io = multipath_end_request; 138 mp_bh->bio.bi_end_io = multipath_end_request;
@@ -355,21 +355,22 @@ static void multipathd(struct md_thread *thread)
355 spin_unlock_irqrestore(&conf->device_lock, flags); 355 spin_unlock_irqrestore(&conf->device_lock, flags);
356 356
357 bio = &mp_bh->bio; 357 bio = &mp_bh->bio;
358 bio->bi_sector = mp_bh->master_bio->bi_sector; 358 bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
359 359
360 if ((mp_bh->path = multipath_map (conf))<0) { 360 if ((mp_bh->path = multipath_map (conf))<0) {
361 printk(KERN_ALERT "multipath: %s: unrecoverable IO read" 361 printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
362 " error for block %llu\n", 362 " error for block %llu\n",
363 bdevname(bio->bi_bdev,b), 363 bdevname(bio->bi_bdev,b),
364 (unsigned long long)bio->bi_sector); 364 (unsigned long long)bio->bi_iter.bi_sector);
365 multipath_end_bh_io(mp_bh, -EIO); 365 multipath_end_bh_io(mp_bh, -EIO);
366 } else { 366 } else {
367 printk(KERN_ERR "multipath: %s: redirecting sector %llu" 367 printk(KERN_ERR "multipath: %s: redirecting sector %llu"
368 " to another IO path\n", 368 " to another IO path\n",
369 bdevname(bio->bi_bdev,b), 369 bdevname(bio->bi_bdev,b),
370 (unsigned long long)bio->bi_sector); 370 (unsigned long long)bio->bi_iter.bi_sector);
371 *bio = *(mp_bh->master_bio); 371 *bio = *(mp_bh->master_bio);
372 bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; 372 bio->bi_iter.bi_sector +=
373 conf->multipaths[mp_bh->path].rdev->data_offset;
373 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; 374 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
374 bio->bi_rw |= REQ_FAILFAST_TRANSPORT; 375 bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
375 bio->bi_end_io = multipath_end_request; 376 bio->bi_end_io = multipath_end_request;
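
The multipath changes above are part of the immutable-biovec conversion: the advancing cursor of struct bio (bi_sector, bi_size, bi_idx) moved into bio->bi_iter, so every sector remap now goes through that member. A toy model of the field move, using simplified stand-in structs rather than the real kernel definitions:

#include <stdio.h>

typedef unsigned long long sector_t;

struct bvec_iter {
	sector_t     bi_sector;	/* current sector; advances as the bio progresses */
	unsigned int bi_size;	/* residual byte count */
};

struct bio {
	struct bvec_iter bi_iter;	/* was: sector_t bi_sector; unsigned bi_size; */
};

int main(void)
{
	struct bio bio = { .bi_iter = { .bi_sector = 2048, .bi_size = 4096 } };
	sector_t data_offset = 8192;	/* per-rdev data offset, as in multipath_make_request() */

	/* old code: bio.bi_sector += data_offset; */
	bio.bi_iter.bi_sector += data_offset;

	printf("remapped sector: %llu\n", bio.bi_iter.bi_sector);
	return 0;
}
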
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
6 ---help--- 6 ---help---
7 Library providing immutable on-disk data structure support for 7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target. 8 device-mapper targets such as the thin provisioning target.
9
10config DM_DEBUG_BLOCK_STACK_TRACING
11 boolean "Keep stack trace of persistent data block lock holders"
12 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
13 select STACKTRACE
14 ---help---
15 Enable this for messages that may help debug problems with the
16 block manager locking used by thin provisioning and caching.
17
18 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 064a3c271baa..455f79279a16 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -104,7 +104,7 @@ static int __check_holder(struct block_lock *lock)
104 104
105 for (i = 0; i < MAX_HOLDERS; i++) { 105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) { 106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata"); 107 DMERR("recursive lock detected in metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING 108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:"); 109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4); 110 print_stack_trace(lock->traces + i, 4);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 468e371ee9b2..416060c25709 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -770,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
770 770
771/*----------------------------------------------------------------*/ 771/*----------------------------------------------------------------*/
772 772
773static int find_highest_key(struct ro_spine *s, dm_block_t block, 773static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
774 uint64_t *result_key, dm_block_t *next_block) 774 uint64_t *result_key, dm_block_t *next_block)
775{ 775{
776 int i, r; 776 int i, r;
777 uint32_t flags; 777 uint32_t flags;
@@ -788,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
788 else 788 else
789 i--; 789 i--;
790 790
791 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 791 if (find_highest)
792 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
793 else
794 *result_key = le64_to_cpu(ro_node(s)->keys[0]);
795
792 if (next_block || flags & INTERNAL_NODE) 796 if (next_block || flags & INTERNAL_NODE)
793 block = value64(ro_node(s), i); 797 block = value64(ro_node(s), i);
794 798
@@ -799,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
799 return 0; 803 return 0;
800} 804}
801 805
802int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 806static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
803 uint64_t *result_keys) 807 bool find_highest, uint64_t *result_keys)
804{ 808{
805 int r = 0, count = 0, level; 809 int r = 0, count = 0, level;
806 struct ro_spine spine; 810 struct ro_spine spine;
807 811
808 init_ro_spine(&spine, info); 812 init_ro_spine(&spine, info);
809 for (level = 0; level < info->levels; level++) { 813 for (level = 0; level < info->levels; level++) {
810 r = find_highest_key(&spine, root, result_keys + level, 814 r = find_key(&spine, root, find_highest, result_keys + level,
811 level == info->levels - 1 ? NULL : &root); 815 level == info->levels - 1 ? NULL : &root);
812 if (r == -ENODATA) { 816 if (r == -ENODATA) {
813 r = 0; 817 r = 0;
814 break; 818 break;
@@ -822,8 +826,23 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
822 826
823 return r ? r : count; 827 return r ? r : count;
824} 828}
829
830int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
831 uint64_t *result_keys)
832{
833 return dm_btree_find_key(info, root, true, result_keys);
834}
825EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); 835EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
826 836
837int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
838 uint64_t *result_keys)
839{
840 return dm_btree_find_key(info, root, false, result_keys);
841}
842EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
843
844/*----------------------------------------------------------------*/
845
827/* 846/*
828 * FIXME: We shouldn't use a recursive algorithm when we have limited stack 847 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
829 * space. Also this only works for single level trees. 848 * space. Also this only works for single level trees.
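
dm_btree_find_highest_key() and the new dm_btree_find_lowest_key() now share a single traversal, parameterised by find_highest. A sketch of how a caller might combine the pair to bound the keys of a single-level tree; this fragment is illustrative only, assumes the declarations from dm-btree.h, and get_key_range() is not a function added by the patch:

static int get_key_range(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *lo, uint64_t *hi)
{
	int r;

	r = dm_btree_find_lowest_key(info, root, lo);
	if (r < 0)
		return r;

	r = dm_btree_find_highest_key(info, root, hi);
	if (r < 0)
		return r;

	/* Both return the number of key entries filled; 0 means an empty tree. */
	return r ? 0 : -ENODATA;
}
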
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 8672d159e0b5..dacfc34180b4 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -137,6 +137,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
137/* 137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have 138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have 139 * been filled out. Remember trees can have zero entries, and as such have
140 * no lowest key.
141 */
142int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
144
145/*
146 * Returns < 0 on failure. Otherwise the number of key entries that have
147 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key. 148 * no highest key.
141 */ 149 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 150int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 466a60bbd716..aacbe70c2c2e 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
245 return -EINVAL; 245 return -EINVAL;
246 } 246 }
247 247
248 /*
249 * We need to set this before the dm_tm_new_block() call below.
250 */
251 ll->nr_blocks = nr_blocks;
248 for (i = old_blocks; i < blocks; i++) { 252 for (i = old_blocks; i < blocks; i++) {
249 struct dm_block *b; 253 struct dm_block *b;
250 struct disk_index_entry idx; 254 struct disk_index_entry idx;
@@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
252 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); 256 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
253 if (r < 0) 257 if (r < 0)
254 return r; 258 return r;
259
255 idx.blocknr = cpu_to_le64(dm_block_location(b)); 260 idx.blocknr = cpu_to_le64(dm_block_location(b));
256 261
257 r = dm_tm_unlock(ll->tm, b); 262 r = dm_tm_unlock(ll->tm, b);
@@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
266 return r; 271 return r;
267 } 272 }
268 273
269 ll->nr_blocks = nr_blocks;
270 return 0; 274 return 0;
271} 275}
272 276
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 58fc1eef7499..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
91 dm_block_t block; 91 dm_block_t block;
92}; 92};
93 93
94struct bop_ring_buffer {
95 unsigned begin;
96 unsigned end;
97 struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
98};
99
100static void brb_init(struct bop_ring_buffer *brb)
101{
102 brb->begin = 0;
103 brb->end = 0;
104}
105
106static bool brb_empty(struct bop_ring_buffer *brb)
107{
108 return brb->begin == brb->end;
109}
110
111static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
112{
113 unsigned r = old + 1;
114 return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
115}
116
117static int brb_push(struct bop_ring_buffer *brb,
118 enum block_op_type type, dm_block_t b)
119{
120 struct block_op *bop;
121 unsigned next = brb_next(brb, brb->end);
122
123 /*
124 * We don't allow the last bop to be filled, this way we can
125 * differentiate between full and empty.
126 */
127 if (next == brb->begin)
128 return -ENOMEM;
129
130 bop = brb->bops + brb->end;
131 bop->type = type;
132 bop->block = b;
133
134 brb->end = next;
135
136 return 0;
137}
138
139static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
140{
141 struct block_op *bop;
142
143 if (brb_empty(brb))
144 return -ENODATA;
145
146 bop = brb->bops + brb->begin;
147 result->type = bop->type;
148 result->block = bop->block;
149
150 brb->begin = brb_next(brb, brb->begin);
151
152 return 0;
153}
154
155/*----------------------------------------------------------------*/
156
94struct sm_metadata { 157struct sm_metadata {
95 struct dm_space_map sm; 158 struct dm_space_map sm;
96 159
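
bop_ring_buffer keeps one slot permanently unused so that begin == end can mean "empty" while next(end) == begin means "full". A self-contained toy (plain userspace C, not kernel code) with the capacity shrunk to three entries makes the wrap-around easy to check:

#include <stdio.h>

#define TOY_CAPACITY 3	/* stands in for MAX_RECURSIVE_ALLOCATIONS */

struct ring {
	unsigned begin, end;
	int slots[TOY_CAPACITY + 1];	/* one extra, never-filled slot */
};

static unsigned ring_next(unsigned i)
{
	return (i + 1) % (TOY_CAPACITY + 1);
}

static int ring_push(struct ring *r, int v)
{
	if (ring_next(r->end) == r->begin)
		return -1;		/* full */
	r->slots[r->end] = v;
	r->end = ring_next(r->end);
	return 0;
}

static int ring_pop(struct ring *r, int *v)
{
	if (r->begin == r->end)
		return -1;		/* empty */
	*v = r->slots[r->begin];
	r->begin = ring_next(r->begin);
	return 0;
}

int main(void)
{
	struct ring r = { 0, 0, { 0 } };
	int i, v;

	for (i = 0; i < 5; i++)	/* only the first TOY_CAPACITY pushes succeed */
		printf("push %d -> %s\n", i, ring_push(&r, i) ? "full" : "ok");
	while (!ring_pop(&r, &v))
		printf("pop  %d\n", v);
	return 0;
}
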
@@ -101,25 +164,20 @@ struct sm_metadata {
101 164
102 unsigned recursion_count; 165 unsigned recursion_count;
103 unsigned allocated_this_transaction; 166 unsigned allocated_this_transaction;
104 unsigned nr_uncommitted; 167 struct bop_ring_buffer uncommitted;
105 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
106 168
107 struct threshold threshold; 169 struct threshold threshold;
108}; 170};
109 171
110static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) 172static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
111{ 173{
112 struct block_op *op; 174 int r = brb_push(&smm->uncommitted, type, b);
113 175
114 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { 176 if (r) {
115 DMERR("too many recursive allocations"); 177 DMERR("too many recursive allocations");
116 return -ENOMEM; 178 return -ENOMEM;
117 } 179 }
118 180
119 op = smm->uncommitted + smm->nr_uncommitted++;
120 op->type = type;
121 op->block = b;
122
123 return 0; 181 return 0;
124} 182}
125 183
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
158 return -ENOMEM; 216 return -ENOMEM;
159 } 217 }
160 218
161 if (smm->recursion_count == 1 && smm->nr_uncommitted) { 219 if (smm->recursion_count == 1) {
162 while (smm->nr_uncommitted && !r) { 220 while (!brb_empty(&smm->uncommitted)) {
163 smm->nr_uncommitted--; 221 struct block_op bop;
164 r = commit_bop(smm, smm->uncommitted + 222
165 smm->nr_uncommitted); 223 r = brb_pop(&smm->uncommitted, &bop);
224 if (r) {
225 DMERR("bug in bop ring buffer");
226 break;
227 }
228
229 r = commit_bop(smm, &bop);
166 if (r) 230 if (r)
167 break; 231 break;
168 } 232 }
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
217static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, 281static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
218 uint32_t *result) 282 uint32_t *result)
219{ 283{
220 int r, i; 284 int r;
285 unsigned i;
221 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 286 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
222 unsigned adjustment = 0; 287 unsigned adjustment = 0;
223 288
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
225 * We may have some uncommitted adjustments to add. This list 290 * We may have some uncommitted adjustments to add. This list
226 * should always be really short. 291 * should always be really short.
227 */ 292 */
228 for (i = 0; i < smm->nr_uncommitted; i++) { 293 for (i = smm->uncommitted.begin;
229 struct block_op *op = smm->uncommitted + i; 294 i != smm->uncommitted.end;
295 i = brb_next(&smm->uncommitted, i)) {
296 struct block_op *op = smm->uncommitted.bops + i;
230 297
231 if (op->block != b) 298 if (op->block != b)
232 continue; 299 continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
254static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, 321static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
255 dm_block_t b, int *result) 322 dm_block_t b, int *result)
256{ 323{
257 int r, i, adjustment = 0; 324 int r, adjustment = 0;
325 unsigned i;
258 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 326 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
259 uint32_t rc; 327 uint32_t rc;
260 328
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
262 * We may have some uncommitted adjustments to add. This list 330 * We may have some uncommitted adjustments to add. This list
263 * should always be really short. 331 * should always be really short.
264 */ 332 */
265 for (i = 0; i < smm->nr_uncommitted; i++) { 333 for (i = smm->uncommitted.begin;
266 struct block_op *op = smm->uncommitted + i; 334 i != smm->uncommitted.end;
335 i = brb_next(&smm->uncommitted, i)) {
336
337 struct block_op *op = smm->uncommitted.bops + i;
267 338
268 if (op->block != b) 339 if (op->block != b)
269 continue; 340 continue;
@@ -385,13 +456,13 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
385 456
386 int r = sm_metadata_new_block_(sm, b); 457 int r = sm_metadata_new_block_(sm, b);
387 if (r) { 458 if (r) {
388 DMERR("unable to allocate new metadata block"); 459 DMERR_LIMIT("unable to allocate new metadata block");
389 return r; 460 return r;
390 } 461 }
391 462
392 r = sm_metadata_get_nr_free(sm, &count); 463 r = sm_metadata_get_nr_free(sm, &count);
393 if (r) { 464 if (r) {
394 DMERR("couldn't get free block count"); 465 DMERR_LIMIT("couldn't get free block count");
395 return r; 466 return r;
396 } 467 }
397 468
@@ -608,20 +679,38 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
608 * Flick into a mode where all blocks get allocated in the new area. 679 * Flick into a mode where all blocks get allocated in the new area.
609 */ 680 */
610 smm->begin = old_len; 681 smm->begin = old_len;
611 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 682 memcpy(sm, &bootstrap_ops, sizeof(*sm));
612 683
613 /* 684 /*
614 * Extend. 685 * Extend.
615 */ 686 */
616 r = sm_ll_extend(&smm->ll, extra_blocks); 687 r = sm_ll_extend(&smm->ll, extra_blocks);
688 if (r)
689 goto out;
617 690
618 /* 691 /*
619 * Switch back to normal behaviour. 692 * We repeatedly increment then commit until the commit doesn't
693 * allocate any new blocks.
620 */ 694 */
621 memcpy(&smm->sm, &ops, sizeof(smm->sm)); 695 do {
622 for (i = old_len; !r && i < smm->begin; i++) 696 for (i = old_len; !r && i < smm->begin; i++) {
623 r = sm_ll_inc(&smm->ll, i, &ev); 697 r = sm_ll_inc(&smm->ll, i, &ev);
698 if (r)
699 goto out;
700 }
701 old_len = smm->begin;
702
703 r = sm_ll_commit(&smm->ll);
704 if (r)
705 goto out;
706
707 } while (old_len != smm->begin);
624 708
709out:
710 /*
711 * Switch back to normal behaviour.
712 */
713 memcpy(sm, &ops, sizeof(*sm));
625 return r; 714 return r;
626} 715}
627 716
@@ -653,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
653 smm->begin = superblock + 1; 742 smm->begin = superblock + 1;
654 smm->recursion_count = 0; 743 smm->recursion_count = 0;
655 smm->allocated_this_transaction = 0; 744 smm->allocated_this_transaction = 0;
656 smm->nr_uncommitted = 0; 745 brb_init(&smm->uncommitted);
657 threshold_init(&smm->threshold); 746 threshold_init(&smm->threshold);
658 747
659 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 748 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -662,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
662 if (r) 751 if (r)
663 return r; 752 return r;
664 753
754 if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
755 nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
665 r = sm_ll_extend(&smm->ll, nr_blocks); 756 r = sm_ll_extend(&smm->ll, nr_blocks);
666 if (r) 757 if (r)
667 return r; 758 return r;
@@ -695,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
695 smm->begin = 0; 786 smm->begin = 0;
696 smm->recursion_count = 0; 787 smm->recursion_count = 0;
697 smm->allocated_this_transaction = 0; 788 smm->allocated_this_transaction = 0;
698 smm->nr_uncommitted = 0; 789 brb_init(&smm->uncommitted);
699 threshold_init(&smm->threshold); 790 threshold_init(&smm->threshold);
700 791
701 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); 792 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf2..64df923974d8 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
9 9
10#include "dm-transaction-manager.h" 10#include "dm-transaction-manager.h"
11 11
12#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about ~16k metadata blocks.
19 */
20#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
21#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
22
12/* 23/*
13 * Unfortunately we have to use two-phase construction due to the cycle 24 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm. 25 * between the tm and sm.
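
With 512-byte sectors, DM_SM_METADATA_BLOCK_SIZE works out to 8 sectors per 4KiB block, so the new cap is 255 * (2^14 - 64) = 4,161,600 metadata blocks, roughly 15.9 GiB of metadata. A quick standalone check of the arithmetic (not kernel code):

#include <stdio.h>

int main(void)
{
	const unsigned long long block_size = 4096;			/* bytes */
	const unsigned long long block_sectors = block_size >> 9;	/* = 8 */
	const unsigned long long max_blocks = 255ULL * ((1 << 14) - 64);
	const unsigned long long max_sectors = max_blocks * block_sectors;

	printf("max metadata blocks : %llu\n", max_blocks);	/* 4161600 */
	printf("max metadata sectors: %llu\n", max_sectors);	/* 33292800 */
	printf("max metadata size   : ~%.1f GiB\n",
	       (double)max_blocks * block_size / (1ULL << 30));	/* ~15.9 */
	return 0;
}
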
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c4d420b7d2f4..407a99e46f69 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -501,10 +501,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
501 unsigned int chunk_sects, struct bio *bio) 501 unsigned int chunk_sects, struct bio *bio)
502{ 502{
503 if (likely(is_power_of_2(chunk_sects))) { 503 if (likely(is_power_of_2(chunk_sects))) {
504 return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) 504 return chunk_sects >=
505 ((bio->bi_iter.bi_sector & (chunk_sects-1))
505 + bio_sectors(bio)); 506 + bio_sectors(bio));
506 } else{ 507 } else{
507 sector_t sector = bio->bi_sector; 508 sector_t sector = bio->bi_iter.bi_sector;
508 return chunk_sects >= (sector_div(sector, chunk_sects) 509 return chunk_sects >= (sector_div(sector, chunk_sects)
509 + bio_sectors(bio)); 510 + bio_sectors(bio));
510 } 511 }
@@ -512,64 +513,44 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
512 513
513static void raid0_make_request(struct mddev *mddev, struct bio *bio) 514static void raid0_make_request(struct mddev *mddev, struct bio *bio)
514{ 515{
515 unsigned int chunk_sects;
516 sector_t sector_offset;
517 struct strip_zone *zone; 516 struct strip_zone *zone;
518 struct md_rdev *tmp_dev; 517 struct md_rdev *tmp_dev;
518 struct bio *split;
519 519
520 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 520 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
521 md_flush_request(mddev, bio); 521 md_flush_request(mddev, bio);
522 return; 522 return;
523 } 523 }
524 524
525 chunk_sects = mddev->chunk_sectors; 525 do {
526 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { 526 sector_t sector = bio->bi_iter.bi_sector;
527 sector_t sector = bio->bi_sector; 527 unsigned chunk_sects = mddev->chunk_sectors;
528 struct bio_pair *bp;
529 /* Sanity check -- queue functions should prevent this happening */
530 if (bio_segments(bio) > 1)
531 goto bad_map;
532 /* This is a one page bio that upper layers
533 * refuse to split for us, so we need to split it.
534 */
535 if (likely(is_power_of_2(chunk_sects)))
536 bp = bio_split(bio, chunk_sects - (sector &
537 (chunk_sects-1)));
538 else
539 bp = bio_split(bio, chunk_sects -
540 sector_div(sector, chunk_sects));
541 raid0_make_request(mddev, &bp->bio1);
542 raid0_make_request(mddev, &bp->bio2);
543 bio_pair_release(bp);
544 return;
545 }
546 528
547 sector_offset = bio->bi_sector; 529 unsigned sectors = chunk_sects -
548 zone = find_zone(mddev->private, &sector_offset); 530 (likely(is_power_of_2(chunk_sects))
549 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 531 ? (sector & (chunk_sects-1))
550 &sector_offset); 532 : sector_div(sector, chunk_sects));
551 bio->bi_bdev = tmp_dev->bdev;
552 bio->bi_sector = sector_offset + zone->dev_start +
553 tmp_dev->data_offset;
554
555 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
556 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
557 /* Just ignore it */
558 bio_endio(bio, 0);
559 return;
560 }
561 533
562 generic_make_request(bio); 534 if (sectors < bio_sectors(bio)) {
563 return; 535 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
564 536 bio_chain(split, bio);
565bad_map: 537 } else {
566 printk("md/raid0:%s: make_request bug: can't convert block across chunks" 538 split = bio;
567 " or bigger than %dk %llu %d\n", 539 }
568 mdname(mddev), chunk_sects / 2,
569 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
570 540
571 bio_io_error(bio); 541 zone = find_zone(mddev->private, &sector);
572 return; 542 tmp_dev = map_sector(mddev, zone, sector, &sector);
543 split->bi_bdev = tmp_dev->bdev;
544 split->bi_iter.bi_sector = sector + zone->dev_start +
545 tmp_dev->data_offset;
546
547 if (unlikely((split->bi_rw & REQ_DISCARD) &&
548 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
549 /* Just ignore it */
550 bio_endio(split, 0);
551 } else
552 generic_make_request(split);
553 } while (split != bio);
573} 554}
574 555
575static void raid0_status(struct seq_file *seq, struct mddev *mddev) 556static void raid0_status(struct seq_file *seq, struct mddev *mddev)
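
Instead of the old one-page bio_pair path, raid0_make_request() now walks the bio itself, peeling off at most one chunk's worth per pass with bio_split()/bio_chain() and submitting each piece. The sector arithmetic of that loop, pulled out into a standalone example with made-up numbers (128-sector chunks, an arbitrary starting sector):

#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1000;	/* bio->bi_iter.bi_sector */
	unsigned remaining = 300;		/* bio_sectors(bio) */
	const unsigned chunk_sects = 128;	/* power-of-two chunk size, in sectors */

	while (remaining) {
		/* sectors left before the next chunk boundary */
		unsigned sectors = chunk_sects - (sector & (chunk_sects - 1));

		if (sectors > remaining)
			sectors = remaining;	/* final piece: no split needed */

		printf("submit %3u sectors at %llu\n", sectors, sector);
		sector += sectors;
		remaining -= sectors;
	}
	return 0;
}
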
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1e5a540995e9..4a6ca1cb2e78 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -229,7 +229,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
229 int done; 229 int done;
230 struct r1conf *conf = r1_bio->mddev->private; 230 struct r1conf *conf = r1_bio->mddev->private;
231 sector_t start_next_window = r1_bio->start_next_window; 231 sector_t start_next_window = r1_bio->start_next_window;
232 sector_t bi_sector = bio->bi_sector; 232 sector_t bi_sector = bio->bi_iter.bi_sector;
233 233
234 if (bio->bi_phys_segments) { 234 if (bio->bi_phys_segments) {
235 unsigned long flags; 235 unsigned long flags;
@@ -265,9 +265,8 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n", 266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
267 (bio_data_dir(bio) == WRITE) ? "write" : "read", 267 (bio_data_dir(bio) == WRITE) ? "write" : "read",
268 (unsigned long long) bio->bi_sector, 268 (unsigned long long) bio->bi_iter.bi_sector,
269 (unsigned long long) bio->bi_sector + 269 (unsigned long long) bio_end_sector(bio) - 1);
270 bio_sectors(bio) - 1);
271 270
272 call_bio_endio(r1_bio); 271 call_bio_endio(r1_bio);
273 } 272 }
@@ -466,9 +465,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
466 struct bio *mbio = r1_bio->master_bio; 465 struct bio *mbio = r1_bio->master_bio;
467 pr_debug("raid1: behind end write sectors" 466 pr_debug("raid1: behind end write sectors"
468 " %llu-%llu\n", 467 " %llu-%llu\n",
469 (unsigned long long) mbio->bi_sector, 468 (unsigned long long) mbio->bi_iter.bi_sector,
470 (unsigned long long) mbio->bi_sector + 469 (unsigned long long) bio_end_sector(mbio) - 1);
471 bio_sectors(mbio) - 1);
472 call_bio_endio(r1_bio); 470 call_bio_endio(r1_bio);
473 } 471 }
474 } 472 }
@@ -875,7 +873,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
875 else if ((conf->next_resync - RESYNC_WINDOW_SECTORS 873 else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
876 >= bio_end_sector(bio)) || 874 >= bio_end_sector(bio)) ||
877 (conf->next_resync + NEXT_NORMALIO_DISTANCE 875 (conf->next_resync + NEXT_NORMALIO_DISTANCE
878 <= bio->bi_sector)) 876 <= bio->bi_iter.bi_sector))
879 wait = false; 877 wait = false;
880 else 878 else
881 wait = true; 879 wait = true;
@@ -913,20 +911,19 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
913 911
914 if (bio && bio_data_dir(bio) == WRITE) { 912 if (bio && bio_data_dir(bio) == WRITE) {
915 if (conf->next_resync + NEXT_NORMALIO_DISTANCE 913 if (conf->next_resync + NEXT_NORMALIO_DISTANCE
916 <= bio->bi_sector) { 914 <= bio->bi_iter.bi_sector) {
917 if (conf->start_next_window == MaxSector) 915 if (conf->start_next_window == MaxSector)
918 conf->start_next_window = 916 conf->start_next_window =
919 conf->next_resync + 917 conf->next_resync +
920 NEXT_NORMALIO_DISTANCE; 918 NEXT_NORMALIO_DISTANCE;
921 919
922 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) 920 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
923 <= bio->bi_sector) 921 <= bio->bi_iter.bi_sector)
924 conf->next_window_requests++; 922 conf->next_window_requests++;
925 else 923 else
926 conf->current_window_requests++; 924 conf->current_window_requests++;
927 }
928 if (bio->bi_sector >= conf->start_next_window)
929 sector = conf->start_next_window; 925 sector = conf->start_next_window;
926 }
930 } 927 }
931 928
932 conf->nr_pending++; 929 conf->nr_pending++;
@@ -1028,7 +1025,8 @@ do_sync_io:
1028 if (bvecs[i].bv_page) 1025 if (bvecs[i].bv_page)
1029 put_page(bvecs[i].bv_page); 1026 put_page(bvecs[i].bv_page);
1030 kfree(bvecs); 1027 kfree(bvecs);
1031 pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 1028 pr_debug("%dB behind alloc failed, doing sync I/O\n",
1029 bio->bi_iter.bi_size);
1032} 1030}
1033 1031
1034struct raid1_plug_cb { 1032struct raid1_plug_cb {
@@ -1108,7 +1106,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1108 1106
1109 if (bio_data_dir(bio) == WRITE && 1107 if (bio_data_dir(bio) == WRITE &&
1110 bio_end_sector(bio) > mddev->suspend_lo && 1108 bio_end_sector(bio) > mddev->suspend_lo &&
1111 bio->bi_sector < mddev->suspend_hi) { 1109 bio->bi_iter.bi_sector < mddev->suspend_hi) {
1112 /* As the suspend_* range is controlled by 1110 /* As the suspend_* range is controlled by
1113 * userspace, we want an interruptible 1111 * userspace, we want an interruptible
1114 * wait. 1112 * wait.
@@ -1119,7 +1117,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1119 prepare_to_wait(&conf->wait_barrier, 1117 prepare_to_wait(&conf->wait_barrier,
1120 &w, TASK_INTERRUPTIBLE); 1118 &w, TASK_INTERRUPTIBLE);
1121 if (bio_end_sector(bio) <= mddev->suspend_lo || 1119 if (bio_end_sector(bio) <= mddev->suspend_lo ||
1122 bio->bi_sector >= mddev->suspend_hi) 1120 bio->bi_iter.bi_sector >= mddev->suspend_hi)
1123 break; 1121 break;
1124 schedule(); 1122 schedule();
1125 } 1123 }
@@ -1141,7 +1139,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1141 r1_bio->sectors = bio_sectors(bio); 1139 r1_bio->sectors = bio_sectors(bio);
1142 r1_bio->state = 0; 1140 r1_bio->state = 0;
1143 r1_bio->mddev = mddev; 1141 r1_bio->mddev = mddev;
1144 r1_bio->sector = bio->bi_sector; 1142 r1_bio->sector = bio->bi_iter.bi_sector;
1145 1143
1146 /* We might need to issue multiple reads to different 1144 /* We might need to issue multiple reads to different
1147 * devices if there are bad blocks around, so we keep 1145 * devices if there are bad blocks around, so we keep
@@ -1181,12 +1179,13 @@ read_again:
1181 r1_bio->read_disk = rdisk; 1179 r1_bio->read_disk = rdisk;
1182 1180
1183 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1181 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1184 bio_trim(read_bio, r1_bio->sector - bio->bi_sector, 1182 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
1185 max_sectors); 1183 max_sectors);
1186 1184
1187 r1_bio->bios[rdisk] = read_bio; 1185 r1_bio->bios[rdisk] = read_bio;
1188 1186
1189 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 1187 read_bio->bi_iter.bi_sector = r1_bio->sector +
1188 mirror->rdev->data_offset;
1190 read_bio->bi_bdev = mirror->rdev->bdev; 1189 read_bio->bi_bdev = mirror->rdev->bdev;
1191 read_bio->bi_end_io = raid1_end_read_request; 1190 read_bio->bi_end_io = raid1_end_read_request;
1192 read_bio->bi_rw = READ | do_sync; 1191 read_bio->bi_rw = READ | do_sync;
@@ -1198,7 +1197,7 @@ read_again:
1198 */ 1197 */
1199 1198
1200 sectors_handled = (r1_bio->sector + max_sectors 1199 sectors_handled = (r1_bio->sector + max_sectors
1201 - bio->bi_sector); 1200 - bio->bi_iter.bi_sector);
1202 r1_bio->sectors = max_sectors; 1201 r1_bio->sectors = max_sectors;
1203 spin_lock_irq(&conf->device_lock); 1202 spin_lock_irq(&conf->device_lock);
1204 if (bio->bi_phys_segments == 0) 1203 if (bio->bi_phys_segments == 0)
@@ -1219,7 +1218,8 @@ read_again:
1219 r1_bio->sectors = bio_sectors(bio) - sectors_handled; 1218 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1220 r1_bio->state = 0; 1219 r1_bio->state = 0;
1221 r1_bio->mddev = mddev; 1220 r1_bio->mddev = mddev;
1222 r1_bio->sector = bio->bi_sector + sectors_handled; 1221 r1_bio->sector = bio->bi_iter.bi_sector +
1222 sectors_handled;
1223 goto read_again; 1223 goto read_again;
1224 } else 1224 } else
1225 generic_make_request(read_bio); 1225 generic_make_request(read_bio);
@@ -1322,7 +1322,7 @@ read_again:
1322 if (r1_bio->bios[j]) 1322 if (r1_bio->bios[j])
1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1324 r1_bio->state = 0; 1324 r1_bio->state = 0;
1325 allow_barrier(conf, start_next_window, bio->bi_sector); 1325 allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
1326 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1326 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1327 start_next_window = wait_barrier(conf, bio); 1327 start_next_window = wait_barrier(conf, bio);
1328 /* 1328 /*
@@ -1349,7 +1349,7 @@ read_again:
1349 bio->bi_phys_segments++; 1349 bio->bi_phys_segments++;
1350 spin_unlock_irq(&conf->device_lock); 1350 spin_unlock_irq(&conf->device_lock);
1351 } 1351 }
1352 sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; 1352 sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
1353 1353
1354 atomic_set(&r1_bio->remaining, 1); 1354 atomic_set(&r1_bio->remaining, 1);
1355 atomic_set(&r1_bio->behind_remaining, 0); 1355 atomic_set(&r1_bio->behind_remaining, 0);
@@ -1361,7 +1361,7 @@ read_again:
1361 continue; 1361 continue;
1362 1362
1363 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1363 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1364 bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors); 1364 bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
1365 1365
1366 if (first_clone) { 1366 if (first_clone) {
1367 /* do behind I/O ? 1367 /* do behind I/O ?
@@ -1395,7 +1395,7 @@ read_again:
1395 1395
1396 r1_bio->bios[i] = mbio; 1396 r1_bio->bios[i] = mbio;
1397 1397
1398 mbio->bi_sector = (r1_bio->sector + 1398 mbio->bi_iter.bi_sector = (r1_bio->sector +
1399 conf->mirrors[i].rdev->data_offset); 1399 conf->mirrors[i].rdev->data_offset);
1400 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1400 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1401 mbio->bi_end_io = raid1_end_write_request; 1401 mbio->bi_end_io = raid1_end_write_request;
@@ -1435,7 +1435,7 @@ read_again:
1435 r1_bio->sectors = bio_sectors(bio) - sectors_handled; 1435 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1436 r1_bio->state = 0; 1436 r1_bio->state = 0;
1437 r1_bio->mddev = mddev; 1437 r1_bio->mddev = mddev;
1438 r1_bio->sector = bio->bi_sector + sectors_handled; 1438 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1439 goto retry_write; 1439 goto retry_write;
1440 } 1440 }
1441 1441
@@ -1953,20 +1953,24 @@ static int process_checks(struct r1bio *r1_bio)
1953 for (i = 0; i < conf->raid_disks * 2; i++) { 1953 for (i = 0; i < conf->raid_disks * 2; i++) {
1954 int j; 1954 int j;
1955 int size; 1955 int size;
1956 int uptodate;
1956 struct bio *b = r1_bio->bios[i]; 1957 struct bio *b = r1_bio->bios[i];
1957 if (b->bi_end_io != end_sync_read) 1958 if (b->bi_end_io != end_sync_read)
1958 continue; 1959 continue;
1959 /* fixup the bio for reuse */ 1960 /* fixup the bio for reuse, but preserve BIO_UPTODATE */
1961 uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
1960 bio_reset(b); 1962 bio_reset(b);
1963 if (!uptodate)
1964 clear_bit(BIO_UPTODATE, &b->bi_flags);
1961 b->bi_vcnt = vcnt; 1965 b->bi_vcnt = vcnt;
1962 b->bi_size = r1_bio->sectors << 9; 1966 b->bi_iter.bi_size = r1_bio->sectors << 9;
1963 b->bi_sector = r1_bio->sector + 1967 b->bi_iter.bi_sector = r1_bio->sector +
1964 conf->mirrors[i].rdev->data_offset; 1968 conf->mirrors[i].rdev->data_offset;
1965 b->bi_bdev = conf->mirrors[i].rdev->bdev; 1969 b->bi_bdev = conf->mirrors[i].rdev->bdev;
1966 b->bi_end_io = end_sync_read; 1970 b->bi_end_io = end_sync_read;
1967 b->bi_private = r1_bio; 1971 b->bi_private = r1_bio;
1968 1972
1969 size = b->bi_size; 1973 size = b->bi_iter.bi_size;
1970 for (j = 0; j < vcnt ; j++) { 1974 for (j = 0; j < vcnt ; j++) {
1971 struct bio_vec *bi; 1975 struct bio_vec *bi;
1972 bi = &b->bi_io_vec[j]; 1976 bi = &b->bi_io_vec[j];
@@ -1990,11 +1994,14 @@ static int process_checks(struct r1bio *r1_bio)
1990 int j; 1994 int j;
1991 struct bio *pbio = r1_bio->bios[primary]; 1995 struct bio *pbio = r1_bio->bios[primary];
1992 struct bio *sbio = r1_bio->bios[i]; 1996 struct bio *sbio = r1_bio->bios[i];
1997 int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
1993 1998
1994 if (sbio->bi_end_io != end_sync_read) 1999 if (sbio->bi_end_io != end_sync_read)
1995 continue; 2000 continue;
2001 /* Now we can 'fixup' the BIO_UPTODATE flag */
2002 set_bit(BIO_UPTODATE, &sbio->bi_flags);
1996 2003
1997 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { 2004 if (uptodate) {
1998 for (j = vcnt; j-- ; ) { 2005 for (j = vcnt; j-- ; ) {
1999 struct page *p, *s; 2006 struct page *p, *s;
2000 p = pbio->bi_io_vec[j].bv_page; 2007 p = pbio->bi_io_vec[j].bv_page;
@@ -2009,7 +2016,7 @@ static int process_checks(struct r1bio *r1_bio)
2009 if (j >= 0) 2016 if (j >= 0)
2010 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 2017 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
2011 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 2018 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
2012 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { 2019 && uptodate)) {
2013 /* No need to write to this device. */ 2020 /* No need to write to this device. */
2014 sbio->bi_end_io = NULL; 2021 sbio->bi_end_io = NULL;
2015 rdev_dec_pending(conf->mirrors[i].rdev, mddev); 2022 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2221,11 +2228,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2221 } 2228 }
2222 2229
2223 wbio->bi_rw = WRITE; 2230 wbio->bi_rw = WRITE;
2224 wbio->bi_sector = r1_bio->sector; 2231 wbio->bi_iter.bi_sector = r1_bio->sector;
2225 wbio->bi_size = r1_bio->sectors << 9; 2232 wbio->bi_iter.bi_size = r1_bio->sectors << 9;
2226 2233
2227 bio_trim(wbio, sector - r1_bio->sector, sectors); 2234 bio_trim(wbio, sector - r1_bio->sector, sectors);
2228 wbio->bi_sector += rdev->data_offset; 2235 wbio->bi_iter.bi_sector += rdev->data_offset;
2229 wbio->bi_bdev = rdev->bdev; 2236 wbio->bi_bdev = rdev->bdev;
2230 if (submit_bio_wait(WRITE, wbio) == 0) 2237 if (submit_bio_wait(WRITE, wbio) == 0)
2231 /* failure! */ 2238 /* failure! */
@@ -2339,7 +2346,8 @@ read_more:
2339 } 2346 }
2340 r1_bio->read_disk = disk; 2347 r1_bio->read_disk = disk;
2341 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); 2348 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
2342 bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors); 2349 bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
2350 max_sectors);
2343 r1_bio->bios[r1_bio->read_disk] = bio; 2351 r1_bio->bios[r1_bio->read_disk] = bio;
2344 rdev = conf->mirrors[disk].rdev; 2352 rdev = conf->mirrors[disk].rdev;
2345 printk_ratelimited(KERN_ERR 2353 printk_ratelimited(KERN_ERR
@@ -2348,7 +2356,7 @@ read_more:
2348 mdname(mddev), 2356 mdname(mddev),
2349 (unsigned long long)r1_bio->sector, 2357 (unsigned long long)r1_bio->sector,
2350 bdevname(rdev->bdev, b)); 2358 bdevname(rdev->bdev, b));
2351 bio->bi_sector = r1_bio->sector + rdev->data_offset; 2359 bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
2352 bio->bi_bdev = rdev->bdev; 2360 bio->bi_bdev = rdev->bdev;
2353 bio->bi_end_io = raid1_end_read_request; 2361 bio->bi_end_io = raid1_end_read_request;
2354 bio->bi_rw = READ | do_sync; 2362 bio->bi_rw = READ | do_sync;
@@ -2357,7 +2365,7 @@ read_more:
2357 /* Drat - have to split this up more */ 2365 /* Drat - have to split this up more */
2358 struct bio *mbio = r1_bio->master_bio; 2366 struct bio *mbio = r1_bio->master_bio;
2359 int sectors_handled = (r1_bio->sector + max_sectors 2367 int sectors_handled = (r1_bio->sector + max_sectors
2360 - mbio->bi_sector); 2368 - mbio->bi_iter.bi_sector);
2361 r1_bio->sectors = max_sectors; 2369 r1_bio->sectors = max_sectors;
2362 spin_lock_irq(&conf->device_lock); 2370 spin_lock_irq(&conf->device_lock);
2363 if (mbio->bi_phys_segments == 0) 2371 if (mbio->bi_phys_segments == 0)
@@ -2375,7 +2383,8 @@ read_more:
2375 r1_bio->state = 0; 2383 r1_bio->state = 0;
2376 set_bit(R1BIO_ReadError, &r1_bio->state); 2384 set_bit(R1BIO_ReadError, &r1_bio->state);
2377 r1_bio->mddev = mddev; 2385 r1_bio->mddev = mddev;
2378 r1_bio->sector = mbio->bi_sector + sectors_handled; 2386 r1_bio->sector = mbio->bi_iter.bi_sector +
2387 sectors_handled;
2379 2388
2380 goto read_more; 2389 goto read_more;
2381 } else 2390 } else
@@ -2599,7 +2608,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2599 } 2608 }
2600 if (bio->bi_end_io) { 2609 if (bio->bi_end_io) {
2601 atomic_inc(&rdev->nr_pending); 2610 atomic_inc(&rdev->nr_pending);
2602 bio->bi_sector = sector_nr + rdev->data_offset; 2611 bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
2603 bio->bi_bdev = rdev->bdev; 2612 bio->bi_bdev = rdev->bdev;
2604 bio->bi_private = r1_bio; 2613 bio->bi_private = r1_bio;
2605 } 2614 }
@@ -2699,7 +2708,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2699 continue; 2708 continue;
2700 /* remove last page from this bio */ 2709 /* remove last page from this bio */
2701 bio->bi_vcnt--; 2710 bio->bi_vcnt--;
2702 bio->bi_size -= len; 2711 bio->bi_iter.bi_size -= len;
2703 bio->bi_flags &= ~(1<< BIO_SEG_VALID); 2712 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
2704 } 2713 }
2705 goto bio_full; 2714 goto bio_full;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c504e8389e69..33fc408e5eac 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1152,14 +1152,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1152 kfree(plug); 1152 kfree(plug);
1153} 1153}
1154 1154
1155static void make_request(struct mddev *mddev, struct bio * bio) 1155static void __make_request(struct mddev *mddev, struct bio *bio)
1156{ 1156{
1157 struct r10conf *conf = mddev->private; 1157 struct r10conf *conf = mddev->private;
1158 struct r10bio *r10_bio; 1158 struct r10bio *r10_bio;
1159 struct bio *read_bio; 1159 struct bio *read_bio;
1160 int i; 1160 int i;
1161 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1162 int chunk_sects = chunk_mask + 1;
1163 const int rw = bio_data_dir(bio); 1161 const int rw = bio_data_dir(bio);
1164 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1162 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1165 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1163 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
@@ -1174,88 +1172,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1174 int max_sectors; 1172 int max_sectors;
1175 int sectors; 1173 int sectors;
1176 1174
1177 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1178 md_flush_request(mddev, bio);
1179 return;
1180 }
1181
1182 /* If this request crosses a chunk boundary, we need to
1183 * split it. This will only happen for 1 PAGE (or less) requests.
1184 */
1185 if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
1186 > chunk_sects
1187 && (conf->geo.near_copies < conf->geo.raid_disks
1188 || conf->prev.near_copies < conf->prev.raid_disks))) {
1189 struct bio_pair *bp;
1190 /* Sanity check -- queue functions should prevent this happening */
1191 if (bio_segments(bio) > 1)
1192 goto bad_map;
1193 /* This is a one page bio that upper layers
1194 * refuse to split for us, so we need to split it.
1195 */
1196 bp = bio_split(bio,
1197 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1198
1199 /* Each of these 'make_request' calls will call 'wait_barrier'.
1200 * If the first succeeds but the second blocks due to the resync
1201 * thread raising the barrier, we will deadlock because the
1202 * IO to the underlying device will be queued in generic_make_request
1203 * and will never complete, so will never reduce nr_pending.
1204 * So increment nr_waiting here so no new raise_barriers will
1205 * succeed, and so the second wait_barrier cannot block.
1206 */
1207 spin_lock_irq(&conf->resync_lock);
1208 conf->nr_waiting++;
1209 spin_unlock_irq(&conf->resync_lock);
1210
1211 make_request(mddev, &bp->bio1);
1212 make_request(mddev, &bp->bio2);
1213
1214 spin_lock_irq(&conf->resync_lock);
1215 conf->nr_waiting--;
1216 wake_up(&conf->wait_barrier);
1217 spin_unlock_irq(&conf->resync_lock);
1218
1219 bio_pair_release(bp);
1220 return;
1221 bad_map:
1222 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1223 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1224 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
1225
1226 bio_io_error(bio);
1227 return;
1228 }
1229
1230 md_write_start(mddev, bio);
1231
1232 /*
1233 * Register the new request and wait if the reconstruction
1234 * thread has put up a bar for new requests.
1235 * Continue immediately if no resync is active currently.
1236 */
1237 wait_barrier(conf);
1238
1239 sectors = bio_sectors(bio); 1175 sectors = bio_sectors(bio);
1240 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1176 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1241 bio->bi_sector < conf->reshape_progress && 1177 bio->bi_iter.bi_sector < conf->reshape_progress &&
1242 bio->bi_sector + sectors > conf->reshape_progress) { 1178 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1243 /* IO spans the reshape position. Need to wait for 1179 /* IO spans the reshape position. Need to wait for
1244 * reshape to pass 1180 * reshape to pass
1245 */ 1181 */
1246 allow_barrier(conf); 1182 allow_barrier(conf);
1247 wait_event(conf->wait_barrier, 1183 wait_event(conf->wait_barrier,
1248 conf->reshape_progress <= bio->bi_sector || 1184 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1249 conf->reshape_progress >= bio->bi_sector + sectors); 1185 conf->reshape_progress >= bio->bi_iter.bi_sector +
1186 sectors);
1250 wait_barrier(conf); 1187 wait_barrier(conf);
1251 } 1188 }
1252 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1189 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1253 bio_data_dir(bio) == WRITE && 1190 bio_data_dir(bio) == WRITE &&
1254 (mddev->reshape_backwards 1191 (mddev->reshape_backwards
1255 ? (bio->bi_sector < conf->reshape_safe && 1192 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1256 bio->bi_sector + sectors > conf->reshape_progress) 1193 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1257 : (bio->bi_sector + sectors > conf->reshape_safe && 1194 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1258 bio->bi_sector < conf->reshape_progress))) { 1195 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1259 /* Need to update reshape_position in metadata */ 1196 /* Need to update reshape_position in metadata */
1260 mddev->reshape_position = conf->reshape_progress; 1197 mddev->reshape_position = conf->reshape_progress;
1261 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1198 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1273,7 +1210,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1273 r10_bio->sectors = sectors; 1210 r10_bio->sectors = sectors;
1274 1211
1275 r10_bio->mddev = mddev; 1212 r10_bio->mddev = mddev;
1276 r10_bio->sector = bio->bi_sector; 1213 r10_bio->sector = bio->bi_iter.bi_sector;
1277 r10_bio->state = 0; 1214 r10_bio->state = 0;
1278 1215
1279 /* We might need to issue multiple reads to different 1216 /* We might need to issue multiple reads to different
@@ -1302,13 +1239,13 @@ read_again:
1302 slot = r10_bio->read_slot; 1239 slot = r10_bio->read_slot;
1303 1240
1304 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1241 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1305 bio_trim(read_bio, r10_bio->sector - bio->bi_sector, 1242 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1306 max_sectors); 1243 max_sectors);
1307 1244
1308 r10_bio->devs[slot].bio = read_bio; 1245 r10_bio->devs[slot].bio = read_bio;
1309 r10_bio->devs[slot].rdev = rdev; 1246 r10_bio->devs[slot].rdev = rdev;
1310 1247
1311 read_bio->bi_sector = r10_bio->devs[slot].addr + 1248 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1312 choose_data_offset(r10_bio, rdev); 1249 choose_data_offset(r10_bio, rdev);
1313 read_bio->bi_bdev = rdev->bdev; 1250 read_bio->bi_bdev = rdev->bdev;
1314 read_bio->bi_end_io = raid10_end_read_request; 1251 read_bio->bi_end_io = raid10_end_read_request;
@@ -1319,15 +1256,15 @@ read_again:
1319 /* Could not read all from this device, so we will 1256 /* Could not read all from this device, so we will
1320 * need another r10_bio. 1257 * need another r10_bio.
1321 */ 1258 */
1322 sectors_handled = (r10_bio->sectors + max_sectors 1259 sectors_handled = (r10_bio->sector + max_sectors
1323 - bio->bi_sector); 1260 - bio->bi_iter.bi_sector);
1324 r10_bio->sectors = max_sectors; 1261 r10_bio->sectors = max_sectors;
1325 spin_lock_irq(&conf->device_lock); 1262 spin_lock_irq(&conf->device_lock);
1326 if (bio->bi_phys_segments == 0) 1263 if (bio->bi_phys_segments == 0)
1327 bio->bi_phys_segments = 2; 1264 bio->bi_phys_segments = 2;
1328 else 1265 else
1329 bio->bi_phys_segments++; 1266 bio->bi_phys_segments++;
1330 spin_unlock(&conf->device_lock); 1267 spin_unlock_irq(&conf->device_lock);
1331 /* Cannot call generic_make_request directly 1268 /* Cannot call generic_make_request directly
1332 * as that will be queued in __generic_make_request 1269 * as that will be queued in __generic_make_request
1333 * and subsequent mempool_alloc might block 1270 * and subsequent mempool_alloc might block
@@ -1341,7 +1278,8 @@ read_again:
1341 r10_bio->sectors = bio_sectors(bio) - sectors_handled; 1278 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1342 r10_bio->state = 0; 1279 r10_bio->state = 0;
1343 r10_bio->mddev = mddev; 1280 r10_bio->mddev = mddev;
1344 r10_bio->sector = bio->bi_sector + sectors_handled; 1281 r10_bio->sector = bio->bi_iter.bi_sector +
1282 sectors_handled;
1345 goto read_again; 1283 goto read_again;
1346 } else 1284 } else
1347 generic_make_request(read_bio); 1285 generic_make_request(read_bio);
@@ -1499,7 +1437,8 @@ retry_write:
1499 bio->bi_phys_segments++; 1437 bio->bi_phys_segments++;
1500 spin_unlock_irq(&conf->device_lock); 1438 spin_unlock_irq(&conf->device_lock);
1501 } 1439 }
1502 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; 1440 sectors_handled = r10_bio->sector + max_sectors -
1441 bio->bi_iter.bi_sector;
1503 1442
1504 atomic_set(&r10_bio->remaining, 1); 1443 atomic_set(&r10_bio->remaining, 1);
1505 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1444 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
@@ -1510,11 +1449,11 @@ retry_write:
1510 if (r10_bio->devs[i].bio) { 1449 if (r10_bio->devs[i].bio) {
1511 struct md_rdev *rdev = conf->mirrors[d].rdev; 1450 struct md_rdev *rdev = conf->mirrors[d].rdev;
1512 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1451 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1513 bio_trim(mbio, r10_bio->sector - bio->bi_sector, 1452 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1514 max_sectors); 1453 max_sectors);
1515 r10_bio->devs[i].bio = mbio; 1454 r10_bio->devs[i].bio = mbio;
1516 1455
1517 mbio->bi_sector = (r10_bio->devs[i].addr+ 1456 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1518 choose_data_offset(r10_bio, 1457 choose_data_offset(r10_bio,
1519 rdev)); 1458 rdev));
1520 mbio->bi_bdev = rdev->bdev; 1459 mbio->bi_bdev = rdev->bdev;
@@ -1553,11 +1492,11 @@ retry_write:
1553 rdev = conf->mirrors[d].rdev; 1492 rdev = conf->mirrors[d].rdev;
1554 } 1493 }
1555 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1494 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1556 bio_trim(mbio, r10_bio->sector - bio->bi_sector, 1495 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1557 max_sectors); 1496 max_sectors);
1558 r10_bio->devs[i].repl_bio = mbio; 1497 r10_bio->devs[i].repl_bio = mbio;
1559 1498
1560 mbio->bi_sector = (r10_bio->devs[i].addr + 1499 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1561 choose_data_offset( 1500 choose_data_offset(
1562 r10_bio, rdev)); 1501 r10_bio, rdev));
1563 mbio->bi_bdev = rdev->bdev; 1502 mbio->bi_bdev = rdev->bdev;
@@ -1591,11 +1530,57 @@ retry_write:
1591 r10_bio->sectors = bio_sectors(bio) - sectors_handled; 1530 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1592 1531
1593 r10_bio->mddev = mddev; 1532 r10_bio->mddev = mddev;
1594 r10_bio->sector = bio->bi_sector + sectors_handled; 1533 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1595 r10_bio->state = 0; 1534 r10_bio->state = 0;
1596 goto retry_write; 1535 goto retry_write;
1597 } 1536 }
1598 one_write_done(r10_bio); 1537 one_write_done(r10_bio);
1538}
1539
1540static void make_request(struct mddev *mddev, struct bio *bio)
1541{
1542 struct r10conf *conf = mddev->private;
1543 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1544 int chunk_sects = chunk_mask + 1;
1545
1546 struct bio *split;
1547
1548 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1549 md_flush_request(mddev, bio);
1550 return;
1551 }
1552
1553 md_write_start(mddev, bio);
1554
1555 /*
1556 * Register the new request and wait if the reconstruction
1557 * thread has put up a bar for new requests.
1558 * Continue immediately if no resync is active currently.
1559 */
1560 wait_barrier(conf);
1561
1562 do {
1563
1564 /*
1565 * If this request crosses a chunk boundary, we need to split
1566 * it.
1567 */
1568 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1569 bio_sectors(bio) > chunk_sects
1570 && (conf->geo.near_copies < conf->geo.raid_disks
1571 || conf->prev.near_copies <
1572 conf->prev.raid_disks))) {
1573 split = bio_split(bio, chunk_sects -
1574 (bio->bi_iter.bi_sector &
1575 (chunk_sects - 1)),
1576 GFP_NOIO, fs_bio_set);
1577 bio_chain(split, bio);
1578 } else {
1579 split = bio;
1580 }
1581
1582 __make_request(mddev, split);
1583 } while (split != bio);
1599 1584
1600 /* In case raid10d snuck in to freeze_array */ 1585 /* In case raid10d snuck in to freeze_array */
1601 wake_up(&conf->wait_barrier); 1586 wake_up(&conf->wait_barrier);
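With the new make_request() above, a request that crosses a chunk boundary is no longer handled piecewise by tracking sectors_handled in the caller; it is cut with bio_split() and the fragment's completion is chained back to the parent with bio_chain(). A minimal sketch of that loop, assuming the 3.14 bio_split()/bio_chain() signatures; issue_request() is a hypothetical stand-in for __make_request(), and the geometry check from the patch is omitted:

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Sketch of the split-and-chain pattern used by the new make_request():
 * carve off the piece that fits below the next chunk boundary, chain its
 * completion to the parent, and loop until the remainder fits.
 * issue_request() is a hypothetical stand-in for __make_request().
 */
static void split_by_chunk(struct bio *bio, unsigned int chunk_sects,
                           void (*issue_request)(struct bio *))
{
        struct bio *split;

        do {
                unsigned int offset = bio->bi_iter.bi_sector & (chunk_sects - 1);

                if (offset + bio_sectors(bio) > chunk_sects) {
                        split = bio_split(bio, chunk_sects - offset,
                                          GFP_NOIO, fs_bio_set);
                        bio_chain(split, bio);  /* split completes into bio */
                } else {
                        split = bio;
                }
                issue_request(split);
        } while (split != bio);
}
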
@@ -2124,10 +2109,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2124 bio_reset(tbio); 2109 bio_reset(tbio);
2125 2110
2126 tbio->bi_vcnt = vcnt; 2111 tbio->bi_vcnt = vcnt;
2127 tbio->bi_size = r10_bio->sectors << 9; 2112 tbio->bi_iter.bi_size = r10_bio->sectors << 9;
2128 tbio->bi_rw = WRITE; 2113 tbio->bi_rw = WRITE;
2129 tbio->bi_private = r10_bio; 2114 tbio->bi_private = r10_bio;
2130 tbio->bi_sector = r10_bio->devs[i].addr; 2115 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2131 2116
2132 for (j=0; j < vcnt ; j++) { 2117 for (j=0; j < vcnt ; j++) {
2133 tbio->bi_io_vec[j].bv_offset = 0; 2118 tbio->bi_io_vec[j].bv_offset = 0;
@@ -2144,7 +2129,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2144 atomic_inc(&r10_bio->remaining); 2129 atomic_inc(&r10_bio->remaining);
2145 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2130 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2146 2131
2147 tbio->bi_sector += conf->mirrors[d].rdev->data_offset; 2132 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2148 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; 2133 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2149 generic_make_request(tbio); 2134 generic_make_request(tbio);
2150 } 2135 }
@@ -2614,8 +2599,8 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2614 sectors = sect_to_write; 2599 sectors = sect_to_write;
2615 /* Write at 'sector' for 'sectors' */ 2600 /* Write at 'sector' for 'sectors' */
2616 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2601 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2617 bio_trim(wbio, sector - bio->bi_sector, sectors); 2602 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2618 wbio->bi_sector = (r10_bio->devs[i].addr+ 2603 wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
2619 choose_data_offset(r10_bio, rdev) + 2604 choose_data_offset(r10_bio, rdev) +
2620 (sector - r10_bio->sector)); 2605 (sector - r10_bio->sector));
2621 wbio->bi_bdev = rdev->bdev; 2606 wbio->bi_bdev = rdev->bdev;
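narrow_write_error(), like the write path earlier, clones the master bio and restricts the clone with bio_trim() instead of adjusting bi_sector and bi_size by hand. A minimal sketch of that clone-and-trim step; bio_clone_bioset() stands in here for the md-private bio_clone_mddev() wrapper:

#include <linux/bio.h>

/*
 * Sketch: clone a parent bio and restrict the clone to the sub-range
 * [sector, sector + sectors) of the parent.  bio_trim() takes the
 * offset into the clone and the new length, both in sectors.
 */
static struct bio *clone_subrange(struct bio *parent, sector_t sector,
                                  int sectors, struct bio_set *bs)
{
        struct bio *clone = bio_clone_bioset(parent, GFP_NOIO, bs);

        if (!clone)
                return NULL;
        bio_trim(clone, sector - parent->bi_iter.bi_sector, sectors);
        return clone;
}
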
@@ -2687,10 +2672,10 @@ read_more:
2687 (unsigned long long)r10_bio->sector); 2672 (unsigned long long)r10_bio->sector);
2688 bio = bio_clone_mddev(r10_bio->master_bio, 2673 bio = bio_clone_mddev(r10_bio->master_bio,
2689 GFP_NOIO, mddev); 2674 GFP_NOIO, mddev);
2690 bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors); 2675 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2691 r10_bio->devs[slot].bio = bio; 2676 r10_bio->devs[slot].bio = bio;
2692 r10_bio->devs[slot].rdev = rdev; 2677 r10_bio->devs[slot].rdev = rdev;
2693 bio->bi_sector = r10_bio->devs[slot].addr 2678 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2694 + choose_data_offset(r10_bio, rdev); 2679 + choose_data_offset(r10_bio, rdev);
2695 bio->bi_bdev = rdev->bdev; 2680 bio->bi_bdev = rdev->bdev;
2696 bio->bi_rw = READ | do_sync; 2681 bio->bi_rw = READ | do_sync;
@@ -2701,7 +2686,7 @@ read_more:
2701 struct bio *mbio = r10_bio->master_bio; 2686 struct bio *mbio = r10_bio->master_bio;
2702 int sectors_handled = 2687 int sectors_handled =
2703 r10_bio->sector + max_sectors 2688 r10_bio->sector + max_sectors
2704 - mbio->bi_sector; 2689 - mbio->bi_iter.bi_sector;
2705 r10_bio->sectors = max_sectors; 2690 r10_bio->sectors = max_sectors;
2706 spin_lock_irq(&conf->device_lock); 2691 spin_lock_irq(&conf->device_lock);
2707 if (mbio->bi_phys_segments == 0) 2692 if (mbio->bi_phys_segments == 0)
@@ -2719,7 +2704,7 @@ read_more:
2719 set_bit(R10BIO_ReadError, 2704 set_bit(R10BIO_ReadError,
2720 &r10_bio->state); 2705 &r10_bio->state);
2721 r10_bio->mddev = mddev; 2706 r10_bio->mddev = mddev;
2722 r10_bio->sector = mbio->bi_sector 2707 r10_bio->sector = mbio->bi_iter.bi_sector
2723 + sectors_handled; 2708 + sectors_handled;
2724 2709
2725 goto read_more; 2710 goto read_more;
@@ -3157,7 +3142,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3157 bio->bi_end_io = end_sync_read; 3142 bio->bi_end_io = end_sync_read;
3158 bio->bi_rw = READ; 3143 bio->bi_rw = READ;
3159 from_addr = r10_bio->devs[j].addr; 3144 from_addr = r10_bio->devs[j].addr;
3160 bio->bi_sector = from_addr + rdev->data_offset; 3145 bio->bi_iter.bi_sector = from_addr +
3146 rdev->data_offset;
3161 bio->bi_bdev = rdev->bdev; 3147 bio->bi_bdev = rdev->bdev;
3162 atomic_inc(&rdev->nr_pending); 3148 atomic_inc(&rdev->nr_pending);
3163 /* and we write to 'i' (if not in_sync) */ 3149 /* and we write to 'i' (if not in_sync) */
@@ -3181,7 +3167,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3181 bio->bi_private = r10_bio; 3167 bio->bi_private = r10_bio;
3182 bio->bi_end_io = end_sync_write; 3168 bio->bi_end_io = end_sync_write;
3183 bio->bi_rw = WRITE; 3169 bio->bi_rw = WRITE;
3184 bio->bi_sector = to_addr 3170 bio->bi_iter.bi_sector = to_addr
3185 + rdev->data_offset; 3171 + rdev->data_offset;
3186 bio->bi_bdev = rdev->bdev; 3172 bio->bi_bdev = rdev->bdev;
3187 atomic_inc(&r10_bio->remaining); 3173 atomic_inc(&r10_bio->remaining);
@@ -3210,7 +3196,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3210 bio->bi_private = r10_bio; 3196 bio->bi_private = r10_bio;
3211 bio->bi_end_io = end_sync_write; 3197 bio->bi_end_io = end_sync_write;
3212 bio->bi_rw = WRITE; 3198 bio->bi_rw = WRITE;
3213 bio->bi_sector = to_addr + rdev->data_offset; 3199 bio->bi_iter.bi_sector = to_addr +
3200 rdev->data_offset;
3214 bio->bi_bdev = rdev->bdev; 3201 bio->bi_bdev = rdev->bdev;
3215 atomic_inc(&r10_bio->remaining); 3202 atomic_inc(&r10_bio->remaining);
3216 break; 3203 break;
@@ -3218,10 +3205,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3218 if (j == conf->copies) { 3205 if (j == conf->copies) {
3219 /* Cannot recover, so abort the recovery or 3206 /* Cannot recover, so abort the recovery or
3220 * record a bad block */ 3207 * record a bad block */
3221 put_buf(r10_bio);
3222 if (rb2)
3223 atomic_dec(&rb2->remaining);
3224 r10_bio = rb2;
3225 if (any_working) { 3208 if (any_working) {
3226 /* problem is that there are bad blocks 3209 /* problem is that there are bad blocks
3227 * on other device(s) 3210 * on other device(s)
@@ -3253,6 +3236,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3253 mirror->recovery_disabled 3236 mirror->recovery_disabled
3254 = mddev->recovery_disabled; 3237 = mddev->recovery_disabled;
3255 } 3238 }
3239 put_buf(r10_bio);
3240 if (rb2)
3241 atomic_dec(&rb2->remaining);
3242 r10_bio = rb2;
3256 break; 3243 break;
3257 } 3244 }
3258 } 3245 }
@@ -3328,7 +3315,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3328 bio->bi_private = r10_bio; 3315 bio->bi_private = r10_bio;
3329 bio->bi_end_io = end_sync_read; 3316 bio->bi_end_io = end_sync_read;
3330 bio->bi_rw = READ; 3317 bio->bi_rw = READ;
3331 bio->bi_sector = sector + 3318 bio->bi_iter.bi_sector = sector +
3332 conf->mirrors[d].rdev->data_offset; 3319 conf->mirrors[d].rdev->data_offset;
3333 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 3320 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3334 count++; 3321 count++;
@@ -3350,7 +3337,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3350 bio->bi_private = r10_bio; 3337 bio->bi_private = r10_bio;
3351 bio->bi_end_io = end_sync_write; 3338 bio->bi_end_io = end_sync_write;
3352 bio->bi_rw = WRITE; 3339 bio->bi_rw = WRITE;
3353 bio->bi_sector = sector + 3340 bio->bi_iter.bi_sector = sector +
3354 conf->mirrors[d].replacement->data_offset; 3341 conf->mirrors[d].replacement->data_offset;
3355 bio->bi_bdev = conf->mirrors[d].replacement->bdev; 3342 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3356 count++; 3343 count++;
@@ -3397,7 +3384,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3397 bio2 = bio2->bi_next) { 3384 bio2 = bio2->bi_next) {
3398 /* remove last page from this bio */ 3385 /* remove last page from this bio */
3399 bio2->bi_vcnt--; 3386 bio2->bi_vcnt--;
3400 bio2->bi_size -= len; 3387 bio2->bi_iter.bi_size -= len;
3401 bio2->bi_flags &= ~(1<< BIO_SEG_VALID); 3388 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3402 } 3389 }
3403 goto bio_full; 3390 goto bio_full;
@@ -3747,7 +3734,8 @@ static int run(struct mddev *mddev)
3747 !test_bit(In_sync, &disk->rdev->flags)) { 3734 !test_bit(In_sync, &disk->rdev->flags)) {
3748 disk->head_position = 0; 3735 disk->head_position = 0;
3749 mddev->degraded++; 3736 mddev->degraded++;
3750 if (disk->rdev) 3737 if (disk->rdev &&
3738 disk->rdev->saved_raid_disk < 0)
3751 conf->fullsync = 1; 3739 conf->fullsync = 1;
3752 } 3740 }
3753 disk->recovery_disabled = mddev->recovery_disabled - 1; 3741 disk->recovery_disabled = mddev->recovery_disabled - 1;
@@ -4417,7 +4405,7 @@ read_more:
4417 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); 4405 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4418 4406
4419 read_bio->bi_bdev = rdev->bdev; 4407 read_bio->bi_bdev = rdev->bdev;
4420 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4408 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4421 + rdev->data_offset); 4409 + rdev->data_offset);
4422 read_bio->bi_private = r10_bio; 4410 read_bio->bi_private = r10_bio;
4423 read_bio->bi_end_io = end_sync_read; 4411 read_bio->bi_end_io = end_sync_read;
@@ -4425,7 +4413,7 @@ read_more:
4425 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 4413 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4426 read_bio->bi_flags |= 1 << BIO_UPTODATE; 4414 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4427 read_bio->bi_vcnt = 0; 4415 read_bio->bi_vcnt = 0;
4428 read_bio->bi_size = 0; 4416 read_bio->bi_iter.bi_size = 0;
4429 r10_bio->master_bio = read_bio; 4417 r10_bio->master_bio = read_bio;
4430 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4418 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4431 4419
@@ -4451,7 +4439,8 @@ read_more:
4451 4439
4452 bio_reset(b); 4440 bio_reset(b);
4453 b->bi_bdev = rdev2->bdev; 4441 b->bi_bdev = rdev2->bdev;
4454 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4442 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4443 rdev2->new_data_offset;
4455 b->bi_private = r10_bio; 4444 b->bi_private = r10_bio;
4456 b->bi_end_io = end_reshape_write; 4445 b->bi_end_io = end_reshape_write;
4457 b->bi_rw = WRITE; 4446 b->bi_rw = WRITE;
@@ -4478,7 +4467,7 @@ read_more:
4478 bio2 = bio2->bi_next) { 4467 bio2 = bio2->bi_next) {
4479 /* Remove last page from this bio */ 4468 /* Remove last page from this bio */
4480 bio2->bi_vcnt--; 4469 bio2->bi_vcnt--;
4481 bio2->bi_size -= len; 4470 bio2->bi_iter.bi_size -= len;
4482 bio2->bi_flags &= ~(1<<BIO_SEG_VALID); 4471 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4483 } 4472 }
4484 goto bio_full; 4473 goto bio_full;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cc055da02e2a..16f5c21963db 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -133,7 +133,7 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
133static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 133static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
134{ 134{
135 int sectors = bio_sectors(bio); 135 int sectors = bio_sectors(bio);
136 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 136 if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
137 return bio->bi_next; 137 return bio->bi_next;
138 else 138 else
139 return NULL; 139 return NULL;
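r5_next_bio() above walks the singly linked chain of bios attached to one stripe device (linked through bi_next, sorted by start sector) and stops once the next bio begins past the STRIPE_SECTORS window. The later hunks repeat the same walk; a minimal sketch of it, with process_bio() as a hypothetical per-bio callback:

/*
 * Sketch of the per-stripe bio walk used throughout raid5.c: bios attached
 * to one stripe device are linked through bi_next and sorted by start
 * sector, and the walk ends once a bio begins past the STRIPE_SECTORS
 * window.  process_bio() is a hypothetical stand-in for the real work
 * (copying data, completing the bio, ...).
 */
static void walk_stripe_bios(struct bio *head, sector_t dev_sector,
                             void (*process_bio)(struct bio *))
{
        struct bio *bio = head;

        while (bio && bio->bi_iter.bi_sector < dev_sector + STRIPE_SECTORS) {
                struct bio *next = r5_next_bio(bio, dev_sector);

                process_bio(bio);
                bio = next;
        }
}
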
@@ -225,7 +225,7 @@ static void return_io(struct bio *return_bi)
225 225
226 return_bi = bi->bi_next; 226 return_bi = bi->bi_next;
227 bi->bi_next = NULL; 227 bi->bi_next = NULL;
228 bi->bi_size = 0; 228 bi->bi_iter.bi_size = 0;
229 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 229 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
230 bi, 0); 230 bi, 0);
231 bio_endio(bi, 0); 231 bio_endio(bi, 0);
@@ -675,8 +675,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
675 || !conf->inactive_blocked), 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash)); 676 *(conf->hash_locks + hash));
677 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
678 } else 678 } else {
679 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
680 atomic_inc(&sh->count);
681 }
680 } else { 682 } else {
681 spin_lock(&conf->device_lock); 683 spin_lock(&conf->device_lock);
682 if (atomic_read(&sh->count)) { 684 if (atomic_read(&sh->count)) {
@@ -687,20 +689,19 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
687 } else { 689 } else {
688 if (!test_bit(STRIPE_HANDLE, &sh->state)) 690 if (!test_bit(STRIPE_HANDLE, &sh->state))
689 atomic_inc(&conf->active_stripes); 691 atomic_inc(&conf->active_stripes);
690 BUG_ON(list_empty(&sh->lru)); 692 BUG_ON(list_empty(&sh->lru) &&
693 !test_bit(STRIPE_EXPANDING, &sh->state));
691 list_del_init(&sh->lru); 694 list_del_init(&sh->lru);
692 if (sh->group) { 695 if (sh->group) {
693 sh->group->stripes_cnt--; 696 sh->group->stripes_cnt--;
694 sh->group = NULL; 697 sh->group = NULL;
695 } 698 }
696 } 699 }
700 atomic_inc(&sh->count);
697 spin_unlock(&conf->device_lock); 701 spin_unlock(&conf->device_lock);
698 } 702 }
699 } while (sh == NULL); 703 } while (sh == NULL);
700 704
701 if (sh)
702 atomic_inc(&sh->count);
703
704 spin_unlock_irq(conf->hash_locks + hash); 705 spin_unlock_irq(conf->hash_locks + hash);
705 return sh; 706 return sh;
706} 707}
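The get_active_stripe() change moves atomic_inc(&sh->count) inside the locked sections, so the reference is taken while the lookup structures are still protected rather than after the loop. A minimal sketch of that take-the-reference-before-unlocking pattern; struct obj and lookup_locked() are hypothetical stand-ins for the stripe hash lookup:

#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * Sketch of the "take the reference before dropping the lookup lock"
 * pattern: the object found under the lock cannot be recycled between
 * the lookup and the refcount increment.  struct obj and lookup_locked()
 * are hypothetical.
 */
struct obj {
        atomic_t count;
};

static struct obj *lookup_locked(sector_t key);         /* hypothetical */

static struct obj *get_ref(spinlock_t *lock, sector_t key)
{
        struct obj *o;

        spin_lock_irq(lock);
        o = lookup_locked(key);
        if (o)
                atomic_inc(&o->count);  /* ref taken while still locked */
        spin_unlock_irq(lock);
        return o;
}
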
@@ -851,10 +852,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
851 bi->bi_rw, i); 852 bi->bi_rw, i);
852 atomic_inc(&sh->count); 853 atomic_inc(&sh->count);
853 if (use_new_offset(conf, sh)) 854 if (use_new_offset(conf, sh))
854 bi->bi_sector = (sh->sector 855 bi->bi_iter.bi_sector = (sh->sector
855 + rdev->new_data_offset); 856 + rdev->new_data_offset);
856 else 857 else
857 bi->bi_sector = (sh->sector 858 bi->bi_iter.bi_sector = (sh->sector
858 + rdev->data_offset); 859 + rdev->data_offset);
859 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 860 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
860 bi->bi_rw |= REQ_NOMERGE; 861 bi->bi_rw |= REQ_NOMERGE;
@@ -862,7 +863,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
862 bi->bi_vcnt = 1; 863 bi->bi_vcnt = 1;
863 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 864 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
864 bi->bi_io_vec[0].bv_offset = 0; 865 bi->bi_io_vec[0].bv_offset = 0;
865 bi->bi_size = STRIPE_SIZE; 866 bi->bi_iter.bi_size = STRIPE_SIZE;
866 /* 867 /*
867 * If this is discard request, set bi_vcnt 0. We don't 868 * If this is discard request, set bi_vcnt 0. We don't
868 * want to confuse SCSI because SCSI will replace payload 869 * want to confuse SCSI because SCSI will replace payload
@@ -898,15 +899,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
898 rbi->bi_rw, i); 899 rbi->bi_rw, i);
899 atomic_inc(&sh->count); 900 atomic_inc(&sh->count);
900 if (use_new_offset(conf, sh)) 901 if (use_new_offset(conf, sh))
901 rbi->bi_sector = (sh->sector 902 rbi->bi_iter.bi_sector = (sh->sector
902 + rrdev->new_data_offset); 903 + rrdev->new_data_offset);
903 else 904 else
904 rbi->bi_sector = (sh->sector 905 rbi->bi_iter.bi_sector = (sh->sector
905 + rrdev->data_offset); 906 + rrdev->data_offset);
906 rbi->bi_vcnt = 1; 907 rbi->bi_vcnt = 1;
907 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 908 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
908 rbi->bi_io_vec[0].bv_offset = 0; 909 rbi->bi_io_vec[0].bv_offset = 0;
909 rbi->bi_size = STRIPE_SIZE; 910 rbi->bi_iter.bi_size = STRIPE_SIZE;
910 /* 911 /*
911 * If this is discard request, set bi_vcnt 0. We don't 912 * If this is discard request, set bi_vcnt 0. We don't
912 * want to confuse SCSI because SCSI will replace payload 913 * want to confuse SCSI because SCSI will replace payload
@@ -934,24 +935,24 @@ static struct dma_async_tx_descriptor *
934async_copy_data(int frombio, struct bio *bio, struct page *page, 935async_copy_data(int frombio, struct bio *bio, struct page *page,
935 sector_t sector, struct dma_async_tx_descriptor *tx) 936 sector_t sector, struct dma_async_tx_descriptor *tx)
936{ 937{
937 struct bio_vec *bvl; 938 struct bio_vec bvl;
939 struct bvec_iter iter;
938 struct page *bio_page; 940 struct page *bio_page;
939 int i;
940 int page_offset; 941 int page_offset;
941 struct async_submit_ctl submit; 942 struct async_submit_ctl submit;
942 enum async_tx_flags flags = 0; 943 enum async_tx_flags flags = 0;
943 944
944 if (bio->bi_sector >= sector) 945 if (bio->bi_iter.bi_sector >= sector)
945 page_offset = (signed)(bio->bi_sector - sector) * 512; 946 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
946 else 947 else
947 page_offset = (signed)(sector - bio->bi_sector) * -512; 948 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
948 949
949 if (frombio) 950 if (frombio)
950 flags |= ASYNC_TX_FENCE; 951 flags |= ASYNC_TX_FENCE;
951 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 952 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
952 953
953 bio_for_each_segment(bvl, bio, i) { 954 bio_for_each_segment(bvl, bio, iter) {
954 int len = bvl->bv_len; 955 int len = bvl.bv_len;
955 int clen; 956 int clen;
956 int b_offset = 0; 957 int b_offset = 0;
957 958
@@ -967,8 +968,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
967 clen = len; 968 clen = len;
968 969
969 if (clen > 0) { 970 if (clen > 0) {
970 b_offset += bvl->bv_offset; 971 b_offset += bvl.bv_offset;
971 bio_page = bvl->bv_page; 972 bio_page = bvl.bv_page;
972 if (frombio) 973 if (frombio)
973 tx = async_memcpy(page, bio_page, page_offset, 974 tx = async_memcpy(page, bio_page, page_offset,
974 b_offset, clen, &submit); 975 b_offset, clen, &submit);
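async_copy_data() now iterates with the 3.14 bio_for_each_segment(), which yields a struct bio_vec by value and keeps its position in a separate struct bvec_iter, instead of indexing bi_io_vec with an int. A minimal sketch of the new idiom:

#include <linux/bio.h>

/*
 * Sketch of the immutable-biovec iteration idiom: bvl is a struct bio_vec
 * copy (not a pointer into bi_io_vec), and iter carries the position, so
 * the walk also works on partially advanced and split bios.
 */
static unsigned int count_payload_bytes(struct bio *bio)
{
        struct bio_vec bvl;
        struct bvec_iter iter;
        unsigned int bytes = 0;

        bio_for_each_segment(bvl, bio, iter)
                bytes += bvl.bv_len;

        return bytes;
}
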
@@ -1011,7 +1012,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
1011 BUG_ON(!dev->read); 1012 BUG_ON(!dev->read);
1012 rbi = dev->read; 1013 rbi = dev->read;
1013 dev->read = NULL; 1014 dev->read = NULL;
1014 while (rbi && rbi->bi_sector < 1015 while (rbi && rbi->bi_iter.bi_sector <
1015 dev->sector + STRIPE_SECTORS) { 1016 dev->sector + STRIPE_SECTORS) {
1016 rbi2 = r5_next_bio(rbi, dev->sector); 1017 rbi2 = r5_next_bio(rbi, dev->sector);
1017 if (!raid5_dec_bi_active_stripes(rbi)) { 1018 if (!raid5_dec_bi_active_stripes(rbi)) {
@@ -1047,7 +1048,7 @@ static void ops_run_biofill(struct stripe_head *sh)
1047 dev->read = rbi = dev->toread; 1048 dev->read = rbi = dev->toread;
1048 dev->toread = NULL; 1049 dev->toread = NULL;
1049 spin_unlock_irq(&sh->stripe_lock); 1050 spin_unlock_irq(&sh->stripe_lock);
1050 while (rbi && rbi->bi_sector < 1051 while (rbi && rbi->bi_iter.bi_sector <
1051 dev->sector + STRIPE_SECTORS) { 1052 dev->sector + STRIPE_SECTORS) {
1052 tx = async_copy_data(0, rbi, dev->page, 1053 tx = async_copy_data(0, rbi, dev->page,
1053 dev->sector, tx); 1054 dev->sector, tx);
@@ -1389,7 +1390,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1389 wbi = dev->written = chosen; 1390 wbi = dev->written = chosen;
1390 spin_unlock_irq(&sh->stripe_lock); 1391 spin_unlock_irq(&sh->stripe_lock);
1391 1392
1392 while (wbi && wbi->bi_sector < 1393 while (wbi && wbi->bi_iter.bi_sector <
1393 dev->sector + STRIPE_SECTORS) { 1394 dev->sector + STRIPE_SECTORS) {
1394 if (wbi->bi_rw & REQ_FUA) 1395 if (wbi->bi_rw & REQ_FUA)
1395 set_bit(R5_WantFUA, &dev->flags); 1396 set_bit(R5_WantFUA, &dev->flags);
@@ -2110,6 +2111,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
2110 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2111 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2111 } else { 2112 } else {
2112 if (!uptodate) { 2113 if (!uptodate) {
2114 set_bit(STRIPE_DEGRADED, &sh->state);
2113 set_bit(WriteErrorSeen, &rdev->flags); 2115 set_bit(WriteErrorSeen, &rdev->flags);
2114 set_bit(R5_WriteError, &sh->dev[i].flags); 2116 set_bit(R5_WriteError, &sh->dev[i].flags);
2115 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2117 if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -2613,7 +2615,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2613 int firstwrite=0; 2615 int firstwrite=0;
2614 2616
2615 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2617 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2616 (unsigned long long)bi->bi_sector, 2618 (unsigned long long)bi->bi_iter.bi_sector,
2617 (unsigned long long)sh->sector); 2619 (unsigned long long)sh->sector);
2618 2620
2619 /* 2621 /*
@@ -2631,12 +2633,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2631 firstwrite = 1; 2633 firstwrite = 1;
2632 } else 2634 } else
2633 bip = &sh->dev[dd_idx].toread; 2635 bip = &sh->dev[dd_idx].toread;
2634 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2636 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
2635 if (bio_end_sector(*bip) > bi->bi_sector) 2637 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
2636 goto overlap; 2638 goto overlap;
2637 bip = & (*bip)->bi_next; 2639 bip = & (*bip)->bi_next;
2638 } 2640 }
2639 if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) 2641 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
2640 goto overlap; 2642 goto overlap;
2641 2643
2642 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2644 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
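add_stripe_bio() keeps each device's toread/towrite chain sorted by start sector and rejects bios that overlap one already queued; the comparisons above now read the start from bi_iter.bi_sector and the end via bio_end_sector(). A minimal sketch of that sorted insert with the overlap check:

#include <linux/bio.h>
#include <linux/errno.h>

/*
 * Sketch of the sorted insert used by add_stripe_bio(): walk the bi_next
 * chain (kept sorted by start sector), refuse any overlap, and link the
 * new bio in place.  Returns 0 on success, -EBUSY on overlap.
 */
static int insert_sorted_nonoverlapping(struct bio **head, struct bio *bi)
{
        struct bio **bip = head;

        while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
                if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
                        return -EBUSY;          /* overlaps a queued bio */
                bip = &(*bip)->bi_next;
        }
        if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                return -EBUSY;

        bi->bi_next = *bip;
        *bip = bi;
        return 0;
}
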
@@ -2650,7 +2652,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2650 sector_t sector = sh->dev[dd_idx].sector; 2652 sector_t sector = sh->dev[dd_idx].sector;
2651 for (bi=sh->dev[dd_idx].towrite; 2653 for (bi=sh->dev[dd_idx].towrite;
2652 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2654 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2653 bi && bi->bi_sector <= sector; 2655 bi && bi->bi_iter.bi_sector <= sector;
2654 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2656 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2655 if (bio_end_sector(bi) >= sector) 2657 if (bio_end_sector(bi) >= sector)
2656 sector = bio_end_sector(bi); 2658 sector = bio_end_sector(bi);
@@ -2660,7 +2662,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2660 } 2662 }
2661 2663
2662 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2664 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2663 (unsigned long long)(*bip)->bi_sector, 2665 (unsigned long long)(*bip)->bi_iter.bi_sector,
2664 (unsigned long long)sh->sector, dd_idx); 2666 (unsigned long long)sh->sector, dd_idx);
2665 spin_unlock_irq(&sh->stripe_lock); 2667 spin_unlock_irq(&sh->stripe_lock);
2666 2668
@@ -2735,7 +2737,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2735 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2737 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2736 wake_up(&conf->wait_for_overlap); 2738 wake_up(&conf->wait_for_overlap);
2737 2739
2738 while (bi && bi->bi_sector < 2740 while (bi && bi->bi_iter.bi_sector <
2739 sh->dev[i].sector + STRIPE_SECTORS) { 2741 sh->dev[i].sector + STRIPE_SECTORS) {
2740 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2742 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2741 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2743 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2754,7 +2756,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2754 bi = sh->dev[i].written; 2756 bi = sh->dev[i].written;
2755 sh->dev[i].written = NULL; 2757 sh->dev[i].written = NULL;
2756 if (bi) bitmap_end = 1; 2758 if (bi) bitmap_end = 1;
2757 while (bi && bi->bi_sector < 2759 while (bi && bi->bi_iter.bi_sector <
2758 sh->dev[i].sector + STRIPE_SECTORS) { 2760 sh->dev[i].sector + STRIPE_SECTORS) {
2759 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2761 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2760 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2762 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2778,7 +2780,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2778 spin_unlock_irq(&sh->stripe_lock); 2780 spin_unlock_irq(&sh->stripe_lock);
2779 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2781 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2780 wake_up(&conf->wait_for_overlap); 2782 wake_up(&conf->wait_for_overlap);
2781 while (bi && bi->bi_sector < 2783 while (bi && bi->bi_iter.bi_sector <
2782 sh->dev[i].sector + STRIPE_SECTORS) { 2784 sh->dev[i].sector + STRIPE_SECTORS) {
2783 struct bio *nextbi = 2785 struct bio *nextbi =
2784 r5_next_bio(bi, sh->dev[i].sector); 2786 r5_next_bio(bi, sh->dev[i].sector);
@@ -3002,7 +3004,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3002 clear_bit(R5_UPTODATE, &dev->flags); 3004 clear_bit(R5_UPTODATE, &dev->flags);
3003 wbi = dev->written; 3005 wbi = dev->written;
3004 dev->written = NULL; 3006 dev->written = NULL;
3005 while (wbi && wbi->bi_sector < 3007 while (wbi && wbi->bi_iter.bi_sector <
3006 dev->sector + STRIPE_SECTORS) { 3008 dev->sector + STRIPE_SECTORS) {
3007 wbi2 = r5_next_bio(wbi, dev->sector); 3009 wbi2 = r5_next_bio(wbi, dev->sector);
3008 if (!raid5_dec_bi_active_stripes(wbi)) { 3010 if (!raid5_dec_bi_active_stripes(wbi)) {
@@ -3608,7 +3610,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3608 */ 3610 */
3609 set_bit(R5_Insync, &dev->flags); 3611 set_bit(R5_Insync, &dev->flags);
3610 3612
3611 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3613 if (test_bit(R5_WriteError, &dev->flags)) {
3612 /* This flag does not apply to '.replacement' 3614 /* This flag does not apply to '.replacement'
3613 * only to .rdev, so make sure to check that*/ 3615 * only to .rdev, so make sure to check that*/
3614 struct md_rdev *rdev2 = rcu_dereference( 3616 struct md_rdev *rdev2 = rcu_dereference(
@@ -3621,7 +3623,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3621 } else 3623 } else
3622 clear_bit(R5_WriteError, &dev->flags); 3624 clear_bit(R5_WriteError, &dev->flags);
3623 } 3625 }
3624 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3626 if (test_bit(R5_MadeGood, &dev->flags)) {
3625 /* This flag does not apply to '.replacement' 3627 /* This flag does not apply to '.replacement'
3626 * only to .rdev, so make sure to check that*/ 3628 * only to .rdev, so make sure to check that*/
3627 struct md_rdev *rdev2 = rcu_dereference( 3629 struct md_rdev *rdev2 = rcu_dereference(
@@ -4094,7 +4096,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
4094 4096
4095static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4097static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
4096{ 4098{
4097 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 4099 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
4098 unsigned int chunk_sectors = mddev->chunk_sectors; 4100 unsigned int chunk_sectors = mddev->chunk_sectors;
4099 unsigned int bio_sectors = bio_sectors(bio); 4101 unsigned int bio_sectors = bio_sectors(bio);
4100 4102
@@ -4231,9 +4233,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4231 /* 4233 /*
4232 * compute position 4234 * compute position
4233 */ 4235 */
4234 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 4236 align_bi->bi_iter.bi_sector =
4235 0, 4237 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
4236 &dd_idx, NULL); 4238 0, &dd_idx, NULL);
4237 4239
4238 end_sector = bio_end_sector(align_bi); 4240 end_sector = bio_end_sector(align_bi);
4239 rcu_read_lock(); 4241 rcu_read_lock();
@@ -4258,7 +4260,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4258 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4260 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
4259 4261
4260 if (!bio_fits_rdev(align_bi) || 4262 if (!bio_fits_rdev(align_bi) ||
4261 is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), 4263 is_badblock(rdev, align_bi->bi_iter.bi_sector,
4264 bio_sectors(align_bi),
4262 &first_bad, &bad_sectors)) { 4265 &first_bad, &bad_sectors)) {
4263 /* too big in some way, or has a known bad block */ 4266 /* too big in some way, or has a known bad block */
4264 bio_put(align_bi); 4267 bio_put(align_bi);
@@ -4267,7 +4270,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4267 } 4270 }
4268 4271
4269 /* No reshape active, so we can trust rdev->data_offset */ 4272 /* No reshape active, so we can trust rdev->data_offset */
4270 align_bi->bi_sector += rdev->data_offset; 4273 align_bi->bi_iter.bi_sector += rdev->data_offset;
4271 4274
4272 spin_lock_irq(&conf->device_lock); 4275 spin_lock_irq(&conf->device_lock);
4273 wait_event_lock_irq(conf->wait_for_stripe, 4276 wait_event_lock_irq(conf->wait_for_stripe,
@@ -4279,7 +4282,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4279 if (mddev->gendisk) 4282 if (mddev->gendisk)
4280 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4283 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
4281 align_bi, disk_devt(mddev->gendisk), 4284 align_bi, disk_devt(mddev->gendisk),
4282 raid_bio->bi_sector); 4285 raid_bio->bi_iter.bi_sector);
4283 generic_make_request(align_bi); 4286 generic_make_request(align_bi);
4284 return 1; 4287 return 1;
4285 } else { 4288 } else {
@@ -4462,8 +4465,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
4462 /* Skip discard while reshape is happening */ 4465 /* Skip discard while reshape is happening */
4463 return; 4466 return;
4464 4467
4465 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4468 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4466 last_sector = bi->bi_sector + (bi->bi_size>>9); 4469 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
4467 4470
4468 bi->bi_next = NULL; 4471 bi->bi_next = NULL;
4469 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4472 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -4567,7 +4570,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4567 return; 4570 return;
4568 } 4571 }
4569 4572
4570 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4573 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4571 last_sector = bio_end_sector(bi); 4574 last_sector = bio_end_sector(bi);
4572 bi->bi_next = NULL; 4575 bi->bi_next = NULL;
4573 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4576 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -5051,7 +5054,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
5051 int remaining; 5054 int remaining;
5052 int handled = 0; 5055 int handled = 0;
5053 5056
5054 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5057 logical_sector = raid_bio->bi_iter.bi_sector &
5058 ~((sector_t)STRIPE_SECTORS-1);
5055 sector = raid5_compute_sector(conf, logical_sector, 5059 sector = raid5_compute_sector(conf, logical_sector,
5056 0, &dd_idx, NULL); 5060 0, &dd_idx, NULL);
5057 last_sector = bio_end_sector(raid_bio); 5061 last_sector = bio_end_sector(raid_bio);
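The three hunks above (discard, write and retried-read paths) all round the bio's start sector down to a STRIPE_SECTORS boundary before mapping it onto stripes, now reading the sector from bi_iter. A minimal sketch of that rounding, assuming STRIPE_SECTORS is the power-of-two constant from raid5.h:

#include <linux/bio.h>

/*
 * Sketch: round a bio's start down to a stripe boundary and compute the
 * first sector past its end.  STRIPE_SECTORS (from raid5.h) is a power
 * of two, so the mask is equivalent to sector - (sector % STRIPE_SECTORS).
 */
static void stripe_window(struct bio *bio, sector_t *first, sector_t *past_end)
{
        *first    = bio->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS - 1);
        *past_end = bio_end_sector(bio);
}
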
@@ -5510,23 +5514,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
5510 return sectors * (raid_disks - conf->max_degraded); 5514 return sectors * (raid_disks - conf->max_degraded);
5511} 5515}
5512 5516
5517static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5518{
5519 safe_put_page(percpu->spare_page);
5520 kfree(percpu->scribble);
5521 percpu->spare_page = NULL;
5522 percpu->scribble = NULL;
5523}
5524
5525static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5526{
5527 if (conf->level == 6 && !percpu->spare_page)
5528 percpu->spare_page = alloc_page(GFP_KERNEL);
5529 if (!percpu->scribble)
5530 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5531
5532 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
5533 free_scratch_buffer(conf, percpu);
5534 return -ENOMEM;
5535 }
5536
5537 return 0;
5538}
5539
5513static void raid5_free_percpu(struct r5conf *conf) 5540static void raid5_free_percpu(struct r5conf *conf)
5514{ 5541{
5515 struct raid5_percpu *percpu;
5516 unsigned long cpu; 5542 unsigned long cpu;
5517 5543
5518 if (!conf->percpu) 5544 if (!conf->percpu)
5519 return; 5545 return;
5520 5546
5521 get_online_cpus();
5522 for_each_possible_cpu(cpu) {
5523 percpu = per_cpu_ptr(conf->percpu, cpu);
5524 safe_put_page(percpu->spare_page);
5525 kfree(percpu->scribble);
5526 }
5527#ifdef CONFIG_HOTPLUG_CPU 5547#ifdef CONFIG_HOTPLUG_CPU
5528 unregister_cpu_notifier(&conf->cpu_notify); 5548 unregister_cpu_notifier(&conf->cpu_notify);
5529#endif 5549#endif
5550
5551 get_online_cpus();
5552 for_each_possible_cpu(cpu)
5553 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5530 put_online_cpus(); 5554 put_online_cpus();
5531 5555
5532 free_percpu(conf->percpu); 5556 free_percpu(conf->percpu);
@@ -5553,15 +5577,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5553 switch (action) { 5577 switch (action) {
5554 case CPU_UP_PREPARE: 5578 case CPU_UP_PREPARE:
5555 case CPU_UP_PREPARE_FROZEN: 5579 case CPU_UP_PREPARE_FROZEN:
5556 if (conf->level == 6 && !percpu->spare_page) 5580 if (alloc_scratch_buffer(conf, percpu)) {
5557 percpu->spare_page = alloc_page(GFP_KERNEL);
5558 if (!percpu->scribble)
5559 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5560
5561 if (!percpu->scribble ||
5562 (conf->level == 6 && !percpu->spare_page)) {
5563 safe_put_page(percpu->spare_page);
5564 kfree(percpu->scribble);
5565 pr_err("%s: failed memory allocation for cpu%ld\n", 5581 pr_err("%s: failed memory allocation for cpu%ld\n",
5566 __func__, cpu); 5582 __func__, cpu);
5567 return notifier_from_errno(-ENOMEM); 5583 return notifier_from_errno(-ENOMEM);
@@ -5569,10 +5585,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5569 break; 5585 break;
5570 case CPU_DEAD: 5586 case CPU_DEAD:
5571 case CPU_DEAD_FROZEN: 5587 case CPU_DEAD_FROZEN:
5572 safe_put_page(percpu->spare_page); 5588 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5573 kfree(percpu->scribble);
5574 percpu->spare_page = NULL;
5575 percpu->scribble = NULL;
5576 break; 5589 break;
5577 default: 5590 default:
5578 break; 5591 break;
@@ -5584,40 +5597,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5584static int raid5_alloc_percpu(struct r5conf *conf) 5597static int raid5_alloc_percpu(struct r5conf *conf)
5585{ 5598{
5586 unsigned long cpu; 5599 unsigned long cpu;
5587 struct page *spare_page; 5600 int err = 0;
5588 struct raid5_percpu __percpu *allcpus;
5589 void *scribble;
5590 int err;
5591 5601
5592 allcpus = alloc_percpu(struct raid5_percpu); 5602 conf->percpu = alloc_percpu(struct raid5_percpu);
5593 if (!allcpus) 5603 if (!conf->percpu)
5594 return -ENOMEM; 5604 return -ENOMEM;
5595 conf->percpu = allcpus; 5605
5606#ifdef CONFIG_HOTPLUG_CPU
5607 conf->cpu_notify.notifier_call = raid456_cpu_notify;
5608 conf->cpu_notify.priority = 0;
5609 err = register_cpu_notifier(&conf->cpu_notify);
5610 if (err)
5611 return err;
5612#endif
5596 5613
5597 get_online_cpus(); 5614 get_online_cpus();
5598 err = 0;
5599 for_each_present_cpu(cpu) { 5615 for_each_present_cpu(cpu) {
5600 if (conf->level == 6) { 5616 err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5601 spare_page = alloc_page(GFP_KERNEL); 5617 if (err) {
5602 if (!spare_page) { 5618 pr_err("%s: failed memory allocation for cpu%ld\n",
5603 err = -ENOMEM; 5619 __func__, cpu);
5604 break;
5605 }
5606 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
5607 }
5608 scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5609 if (!scribble) {
5610 err = -ENOMEM;
5611 break; 5620 break;
5612 } 5621 }
5613 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
5614 } 5622 }
5615#ifdef CONFIG_HOTPLUG_CPU
5616 conf->cpu_notify.notifier_call = raid456_cpu_notify;
5617 conf->cpu_notify.priority = 0;
5618 if (err == 0)
5619 err = register_cpu_notifier(&conf->cpu_notify);
5620#endif
5621 put_online_cpus(); 5623 put_online_cpus();
5622 5624
5623 return err; 5625 return err;
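raid5_alloc_percpu() now registers the hotplug notifier before filling in the buffers for the CPUs already present, and both the init loop and raid456_cpu_notify() share alloc_scratch_buffer(), which cleans up after itself on failure. A minimal sketch of that allocate-or-release-everything helper pattern; struct scratch and its fields are hypothetical stand-ins for raid5_percpu:

#include <linux/gfp.h>
#include <linux/slab.h>

/*
 * Sketch of the "helper owns its own cleanup" pattern behind
 * alloc_scratch_buffer()/free_scratch_buffer(): the allocator releases
 * anything it managed to allocate before reporting -ENOMEM, so every
 * caller (init loop and hotplug notifier alike) handles failure the
 * same way.  struct scratch and the parameters are hypothetical.
 */
struct scratch {
        struct page *spare_page;
        void *scribble;
};

static void free_scratch(struct scratch *s)
{
        if (s->spare_page)
                __free_page(s->spare_page);
        kfree(s->scribble);
        s->spare_page = NULL;
        s->scribble = NULL;
}

static int alloc_scratch(struct scratch *s, size_t scribble_len, bool need_page)
{
        if (need_page && !s->spare_page)
                s->spare_page = alloc_page(GFP_KERNEL);
        if (!s->scribble)
                s->scribble = kmalloc(scribble_len, GFP_KERNEL);

        if (!s->scribble || (need_page && !s->spare_page)) {
                free_scratch(s);        /* release the partial allocation */
                return -ENOMEM;
        }
        return 0;
}
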
@@ -6099,6 +6101,7 @@ static int run(struct mddev *mddev)
6099 blk_queue_io_min(mddev->queue, chunk_size); 6101 blk_queue_io_min(mddev->queue, chunk_size);
6100 blk_queue_io_opt(mddev->queue, chunk_size * 6102 blk_queue_io_opt(mddev->queue, chunk_size *
6101 (conf->raid_disks - conf->max_degraded)); 6103 (conf->raid_disks - conf->max_degraded));
6104 mddev->queue->limits.raid_partial_stripes_expensive = 1;
6102 /* 6105 /*
6103 * We can only discard a whole stripe. It doesn't make sense to 6106 * We can only discard a whole stripe. It doesn't make sense to
6104 * discard data disk but write parity disk 6107 * discard data disk but write parity disk