Diffstat (limited to 'drivers/md')
74 files changed, 4061 insertions, 3168 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f2ccbc3b9fe4..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -176,8 +176,12 @@ config MD_FAULTY | |||
176 | 176 | ||
177 | source "drivers/md/bcache/Kconfig" | 177 | source "drivers/md/bcache/Kconfig" |
178 | 178 | ||
179 | config BLK_DEV_DM_BUILTIN | ||
180 | boolean | ||
181 | |||
179 | config BLK_DEV_DM | 182 | config BLK_DEV_DM |
180 | tristate "Device mapper support" | 183 | tristate "Device mapper support" |
184 | select BLK_DEV_DM_BUILTIN | ||
181 | ---help--- | 185 | ---help--- |
182 | Device-mapper is a low level volume manager. It works by allowing | 186 | Device-mapper is a low level volume manager. It works by allowing |
183 | people to specify mappings for ranges of logical sectors. Various | 187 | people to specify mappings for ranges of logical sectors. Various |
@@ -238,6 +242,7 @@ config DM_CRYPT | |||
238 | config DM_SNAPSHOT | 242 | config DM_SNAPSHOT |
239 | tristate "Snapshot target" | 243 | tristate "Snapshot target" |
240 | depends on BLK_DEV_DM | 244 | depends on BLK_DEV_DM |
245 | select DM_BUFIO | ||
241 | ---help--- | 246 | ---help--- |
242 | Allow volume managers to take writable snapshots of a device. | 247 | Allow volume managers to take writable snapshots of a device. |
243 | 248 | ||
@@ -249,16 +254,6 @@ config DM_THIN_PROVISIONING | |||
249 | ---help--- | 254 | ---help--- |
250 | Provides thin provisioning and snapshots that share a data store. | 255 | Provides thin provisioning and snapshots that share a data store. |
251 | 256 | ||
252 | config DM_DEBUG_BLOCK_STACK_TRACING | ||
253 | boolean "Keep stack trace of thin provisioning block lock holders" | ||
254 | depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING | ||
255 | select STACKTRACE | ||
256 | ---help--- | ||
257 | Enable this for messages that may help debug problems with the | ||
258 | block manager locking used by thin provisioning. | ||
259 | |||
260 | If unsure, say N. | ||
261 | |||
262 | config DM_CACHE | 257 | config DM_CACHE |
263 | tristate "Cache target (EXPERIMENTAL)" | 258 | tristate "Cache target (EXPERIMENTAL)" |
264 | depends on BLK_DEV_DM | 259 | depends on BLK_DEV_DM |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2acc43fe0229..f26d83292579 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o | |||
32 | obj-$(CONFIG_BCACHE) += bcache/ | 32 | obj-$(CONFIG_BCACHE) += bcache/ |
33 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | 33 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o |
34 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 34 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
35 | obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o | ||
35 | obj-$(CONFIG_DM_BUFIO) += dm-bufio.o | 36 | obj-$(CONFIG_DM_BUFIO) += dm-bufio.o |
36 | obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o | 37 | obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o |
37 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 38 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@ | |||
1 | 1 | ||
2 | obj-$(CONFIG_BCACHE) += bcache.o | 2 | obj-$(CONFIG_BCACHE) += bcache.o |
3 | 3 | ||
4 | bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ | 4 | bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ |
5 | movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o | 5 | io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ |
6 | util.o writeback.o | ||
6 | 7 | ||
7 | CFLAGS_request.o += -Iblock | 8 | CFLAGS_request.o += -Iblock |
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | |||
132 | { | 132 | { |
133 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | 133 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); |
134 | 134 | ||
135 | if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && | 135 | if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { |
136 | CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) | 136 | unsigned i; |
137 | return false; | 137 | |
138 | for (i = 0; i < RESERVE_NONE; i++) | ||
139 | if (!fifo_full(&ca->free[i])) | ||
140 | goto add; | ||
138 | 141 | ||
142 | return false; | ||
143 | } | ||
144 | add: | ||
139 | b->prio = 0; | 145 | b->prio = 0; |
140 | 146 | ||
141 | if (can_inc_bucket_gen(b) && | 147 | if (can_inc_bucket_gen(b) && |
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | |||
162 | fifo_push(&ca->free_inc, b - ca->buckets); | 168 | fifo_push(&ca->free_inc, b - ca->buckets); |
163 | } | 169 | } |
164 | 170 | ||
165 | #define bucket_prio(b) \ | 171 | /* |
166 | (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) | 172 | * Determines what order we're going to reuse buckets, smallest bucket_prio() |
173 | * first: we also take into account the number of sectors of live data in that | ||
174 | * bucket, and in order for that multiply to make sense we have to scale the bucket priorities. | ||
175 | * | ||
176 | * Thus, we scale the bucket priorities so that the bucket with the smallest | ||
177 | * prio is worth 1/8th of what INITIAL_PRIO is worth. | ||
178 | */ | ||
179 | |||
180 | #define bucket_prio(b) \ | ||
181 | ({ \ | ||
182 | unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ | ||
183 | \ | ||
184 | (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ | ||
185 | }) | ||
167 | 186 | ||
168 | #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) | 187 | #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) |
169 | #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) | 188 | #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) |
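As an aside, the scaling described in the comment above can be checked in isolation. The snippet below is a standalone user-space sketch, not kernel code: the min_prio value and the two sample buckets are made up, and only INITIAL_PRIO and the arithmetic come from the hunk.

/* Standalone sketch of the scaled bucket_prio() ordering (not kernel code). */
#include <stdint.h>
#include <stdio.h>

#define INITIAL_PRIO 32768U

struct bucket { uint16_t prio; unsigned sectors_used; };

/*
 * Mirrors the new bucket_prio(): every prio is offset by
 * (INITIAL_PRIO - min_prio) / 8, so a bucket that has decayed all the way
 * down to min_prio still carries a nonzero weight before it is multiplied
 * by its count of live sectors.
 */
static unsigned bucket_prio(const struct bucket *b, uint16_t min_prio)
{
	unsigned base = (INITIAL_PRIO - min_prio) / 8;

	return (b->prio - min_prio + base) * b->sectors_used;
}

int main(void)
{
	uint16_t min_prio = 1000;		/* hypothetical ca->set->min_prio */
	struct bucket cold = { 1000, 512 };	/* fully decayed, much live data */
	struct bucket warm = { 4000, 64 };	/* recently used, little live data */

	printf("cold=%u warm=%u\n",
	       bucket_prio(&cold, min_prio), bucket_prio(&warm, min_prio));
	return 0;
}

With the old, unscaled macro the cold bucket's priority term would have been zero, so its live data would not have counted at all; that is the behaviour the added offset avoids.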
@@ -304,6 +323,21 @@ do { \ | |||
304 | __set_current_state(TASK_RUNNING); \ | 323 | __set_current_state(TASK_RUNNING); \ |
305 | } while (0) | 324 | } while (0) |
306 | 325 | ||
326 | static int bch_allocator_push(struct cache *ca, long bucket) | ||
327 | { | ||
328 | unsigned i; | ||
329 | |||
330 | /* Prios/gens are actually the most important reserve */ | ||
331 | if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) | ||
332 | return true; | ||
333 | |||
334 | for (i = 0; i < RESERVE_NR; i++) | ||
335 | if (fifo_push(&ca->free[i], bucket)) | ||
336 | return true; | ||
337 | |||
338 | return false; | ||
339 | } | ||
340 | |||
307 | static int bch_allocator_thread(void *arg) | 341 | static int bch_allocator_thread(void *arg) |
308 | { | 342 | { |
309 | struct cache *ca = arg; | 343 | struct cache *ca = arg; |
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg) | |||
336 | mutex_lock(&ca->set->bucket_lock); | 370 | mutex_lock(&ca->set->bucket_lock); |
337 | } | 371 | } |
338 | 372 | ||
339 | allocator_wait(ca, !fifo_full(&ca->free)); | 373 | allocator_wait(ca, bch_allocator_push(ca, bucket)); |
340 | |||
341 | fifo_push(&ca->free, bucket); | ||
342 | wake_up(&ca->set->bucket_wait); | 374 | wake_up(&ca->set->bucket_wait); |
343 | } | 375 | } |
344 | 376 | ||
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg) | |||
365 | } | 397 | } |
366 | } | 398 | } |
367 | 399 | ||
368 | long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) | 400 | long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) |
369 | { | 401 | { |
370 | DEFINE_WAIT(w); | 402 | DEFINE_WAIT(w); |
371 | struct bucket *b; | 403 | struct bucket *b; |
372 | long r; | 404 | long r; |
373 | 405 | ||
374 | /* fastpath */ | 406 | /* fastpath */ |
375 | if (fifo_used(&ca->free) > ca->watermark[watermark]) { | 407 | if (fifo_pop(&ca->free[RESERVE_NONE], r) || |
376 | fifo_pop(&ca->free, r); | 408 | fifo_pop(&ca->free[reserve], r)) |
377 | goto out; | 409 | goto out; |
378 | } | ||
379 | 410 | ||
380 | if (!wait) | 411 | if (!wait) |
381 | return -1; | 412 | return -1; |
382 | 413 | ||
383 | while (1) { | 414 | do { |
384 | if (fifo_used(&ca->free) > ca->watermark[watermark]) { | ||
385 | fifo_pop(&ca->free, r); | ||
386 | break; | ||
387 | } | ||
388 | |||
389 | prepare_to_wait(&ca->set->bucket_wait, &w, | 415 | prepare_to_wait(&ca->set->bucket_wait, &w, |
390 | TASK_UNINTERRUPTIBLE); | 416 | TASK_UNINTERRUPTIBLE); |
391 | 417 | ||
392 | mutex_unlock(&ca->set->bucket_lock); | 418 | mutex_unlock(&ca->set->bucket_lock); |
393 | schedule(); | 419 | schedule(); |
394 | mutex_lock(&ca->set->bucket_lock); | 420 | mutex_lock(&ca->set->bucket_lock); |
395 | } | 421 | } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && |
422 | !fifo_pop(&ca->free[reserve], r)); | ||
396 | 423 | ||
397 | finish_wait(&ca->set->bucket_wait, &w); | 424 | finish_wait(&ca->set->bucket_wait, &w); |
398 | out: | 425 | out: |
@@ -401,12 +428,14 @@ out: | |||
401 | if (expensive_debug_checks(ca->set)) { | 428 | if (expensive_debug_checks(ca->set)) { |
402 | size_t iter; | 429 | size_t iter; |
403 | long i; | 430 | long i; |
431 | unsigned j; | ||
404 | 432 | ||
405 | for (iter = 0; iter < prio_buckets(ca) * 2; iter++) | 433 | for (iter = 0; iter < prio_buckets(ca) * 2; iter++) |
406 | BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); | 434 | BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); |
407 | 435 | ||
408 | fifo_for_each(i, &ca->free, iter) | 436 | for (j = 0; j < RESERVE_NR; j++) |
409 | BUG_ON(i == r); | 437 | fifo_for_each(i, &ca->free[j], iter) |
438 | BUG_ON(i == r); | ||
410 | fifo_for_each(i, &ca->free_inc, iter) | 439 | fifo_for_each(i, &ca->free_inc, iter) |
411 | BUG_ON(i == r); | 440 | BUG_ON(i == r); |
412 | fifo_for_each(i, &ca->unused, iter) | 441 | fifo_for_each(i, &ca->unused, iter) |
@@ -419,7 +448,7 @@ out: | |||
419 | 448 | ||
420 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | 449 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); |
421 | 450 | ||
422 | if (watermark <= WATERMARK_METADATA) { | 451 | if (reserve <= RESERVE_PRIO) { |
423 | SET_GC_MARK(b, GC_MARK_METADATA); | 452 | SET_GC_MARK(b, GC_MARK_METADATA); |
424 | SET_GC_MOVE(b, 0); | 453 | SET_GC_MOVE(b, 0); |
425 | b->prio = BTREE_PRIO; | 454 | b->prio = BTREE_PRIO; |
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) | |||
445 | } | 474 | } |
446 | } | 475 | } |
447 | 476 | ||
448 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | 477 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, |
449 | struct bkey *k, int n, bool wait) | 478 | struct bkey *k, int n, bool wait) |
450 | { | 479 | { |
451 | int i; | 480 | int i; |
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | |||
459 | 488 | ||
460 | for (i = 0; i < n; i++) { | 489 | for (i = 0; i < n; i++) { |
461 | struct cache *ca = c->cache_by_alloc[i]; | 490 | struct cache *ca = c->cache_by_alloc[i]; |
462 | long b = bch_bucket_alloc(ca, watermark, wait); | 491 | long b = bch_bucket_alloc(ca, reserve, wait); |
463 | 492 | ||
464 | if (b == -1) | 493 | if (b == -1) |
465 | goto err; | 494 | goto err; |
@@ -478,12 +507,12 @@ err: | |||
478 | return -1; | 507 | return -1; |
479 | } | 508 | } |
480 | 509 | ||
481 | int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | 510 | int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, |
482 | struct bkey *k, int n, bool wait) | 511 | struct bkey *k, int n, bool wait) |
483 | { | 512 | { |
484 | int ret; | 513 | int ret; |
485 | mutex_lock(&c->bucket_lock); | 514 | mutex_lock(&c->bucket_lock); |
486 | ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); | 515 | ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); |
487 | mutex_unlock(&c->bucket_lock); | 516 | mutex_unlock(&c->bucket_lock); |
488 | return ret; | 517 | return ret; |
489 | } | 518 | } |
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, | |||
573 | 602 | ||
574 | while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { | 603 | while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { |
575 | unsigned watermark = write_prio | 604 | unsigned watermark = write_prio |
576 | ? WATERMARK_MOVINGGC | 605 | ? RESERVE_MOVINGGC |
577 | : WATERMARK_NONE; | 606 | : RESERVE_NONE; |
578 | 607 | ||
579 | spin_unlock(&c->data_bucket_lock); | 608 | spin_unlock(&c->data_bucket_lock); |
580 | 609 | ||
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca) | |||
689 | * Then 8 for btree allocations | 718 | * Then 8 for btree allocations |
690 | * Then half for the moving garbage collector | 719 | * Then half for the moving garbage collector |
691 | */ | 720 | */ |
692 | 721 | #if 0 | |
693 | ca->watermark[WATERMARK_PRIO] = 0; | 722 | ca->watermark[WATERMARK_PRIO] = 0; |
694 | 723 | ||
695 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | 724 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); |
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca) | |||
699 | 728 | ||
700 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | 729 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + |
701 | ca->watermark[WATERMARK_MOVINGGC]; | 730 | ca->watermark[WATERMARK_MOVINGGC]; |
702 | 731 | #endif | |
703 | return 0; | 732 | return 0; |
704 | } | 733 | } |
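To make the new reserve scheme concrete, here is a small user-space model of the ordering introduced above: bch_allocator_push() fills RESERVE_PRIO first, and the bch_bucket_alloc() fastpath drains RESERVE_NONE before touching the caller's reserve. Apart from the enum and that ordering, everything here is invented for the sketch; the tiny fixed-size queues stand in for the kernel's DECLARE_FIFO machinery.

/* Toy model of the per-reserve free lists (not the kernel's fifo code). */
#include <stdbool.h>
#include <stdio.h>

enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC, RESERVE_NONE, RESERVE_NR };

#define FIFO_SIZE 4

struct fifo { long d[FIFO_SIZE]; unsigned used; };

static bool fifo_push(struct fifo *f, long v)
{
	if (f->used == FIFO_SIZE)
		return false;
	f->d[f->used++] = v;
	return true;
}

static bool fifo_pop(struct fifo *f, long *v)
{
	if (!f->used)
		return false;
	*v = f->d[0];
	for (unsigned i = 1; i < f->used; i++)
		f->d[i - 1] = f->d[i];
	f->used--;
	return true;
}

/* Same ordering as bch_allocator_push(): prios/gens first, then any reserve. */
static bool allocator_push(struct fifo *freelists, long bucket)
{
	if (fifo_push(&freelists[RESERVE_PRIO], bucket))
		return true;

	for (unsigned i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&freelists[i], bucket))
			return true;

	return false;
}

/* Same ordering as the bch_bucket_alloc() fastpath: unreserved buckets first. */
static bool bucket_alloc(struct fifo *freelists, enum alloc_reserve reserve, long *r)
{
	return fifo_pop(&freelists[RESERVE_NONE], r) ||
	       fifo_pop(&freelists[reserve], r);
}

int main(void)
{
	struct fifo freelists[RESERVE_NR] = { 0 };
	long r;

	for (long b = 0; b < 10; b++)
		allocator_push(freelists, b);

	while (bucket_alloc(freelists, RESERVE_MOVINGGC, &r))
		printf("allocated bucket %ld\n", r);

	return 0;
}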
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 754f43177483..a4c7306ff43d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@ | |||
187 | #include <linux/types.h> | 187 | #include <linux/types.h> |
188 | #include <linux/workqueue.h> | 188 | #include <linux/workqueue.h> |
189 | 189 | ||
190 | #include "bset.h" | ||
190 | #include "util.h" | 191 | #include "util.h" |
191 | #include "closure.h" | 192 | #include "closure.h" |
192 | 193 | ||
@@ -209,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); | |||
209 | #define GC_MARK_RECLAIMABLE 0 | 210 | #define GC_MARK_RECLAIMABLE 0 |
210 | #define GC_MARK_DIRTY 1 | 211 | #define GC_MARK_DIRTY 1 |
211 | #define GC_MARK_METADATA 2 | 212 | #define GC_MARK_METADATA 2 |
212 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); | 213 | #define GC_SECTORS_USED_SIZE 13 |
214 | #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) | ||
215 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); | ||
213 | BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); | 216 | BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); |
214 | 217 | ||
215 | #include "journal.h" | 218 | #include "journal.h" |
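A quick check of what the new MAX_GC_SECTORS_USED macro evaluates to; the arithmetic is taken from the two added lines, and the KiB conversion is only illustrative.

/* User-space sanity check of the GC_SECTORS_USED field bounds. */
#include <stdio.h>

#define GC_SECTORS_USED_SIZE 13
#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))

int main(void)
{
	/* ~0ULL << 13 clears the low 13 bits; inverting keeps only them: 2^13 - 1 */
	printf("MAX_GC_SECTORS_USED = %llu sectors\n", MAX_GC_SECTORS_USED);
	printf("i.e. up to %llu KiB of live data per bucket at 512-byte sectors\n",
	       MAX_GC_SECTORS_USED * 512 / 1024);
	return 0;
}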
@@ -280,7 +283,6 @@ struct bcache_device { | |||
280 | unsigned long sectors_dirty_last; | 283 | unsigned long sectors_dirty_last; |
281 | long sectors_dirty_derivative; | 284 | long sectors_dirty_derivative; |
282 | 285 | ||
283 | mempool_t *unaligned_bvec; | ||
284 | struct bio_set *bio_split; | 286 | struct bio_set *bio_split; |
285 | 287 | ||
286 | unsigned data_csum:1; | 288 | unsigned data_csum:1; |
@@ -310,7 +312,8 @@ struct cached_dev { | |||
310 | struct cache_sb sb; | 312 | struct cache_sb sb; |
311 | struct bio sb_bio; | 313 | struct bio sb_bio; |
312 | struct bio_vec sb_bv[1]; | 314 | struct bio_vec sb_bv[1]; |
313 | struct closure_with_waitlist sb_write; | 315 | struct closure sb_write; |
316 | struct semaphore sb_write_mutex; | ||
314 | 317 | ||
315 | /* Refcount on the cache set. Always nonzero when we're caching. */ | 318 | /* Refcount on the cache set. Always nonzero when we're caching. */ |
316 | atomic_t count; | 319 | atomic_t count; |
@@ -383,12 +386,12 @@ struct cached_dev { | |||
383 | unsigned writeback_rate_p_term_inverse; | 386 | unsigned writeback_rate_p_term_inverse; |
384 | }; | 387 | }; |
385 | 388 | ||
386 | enum alloc_watermarks { | 389 | enum alloc_reserve { |
387 | WATERMARK_PRIO, | 390 | RESERVE_BTREE, |
388 | WATERMARK_METADATA, | 391 | RESERVE_PRIO, |
389 | WATERMARK_MOVINGGC, | 392 | RESERVE_MOVINGGC, |
390 | WATERMARK_NONE, | 393 | RESERVE_NONE, |
391 | WATERMARK_MAX | 394 | RESERVE_NR, |
392 | }; | 395 | }; |
393 | 396 | ||
394 | struct cache { | 397 | struct cache { |
@@ -400,8 +403,6 @@ struct cache { | |||
400 | struct kobject kobj; | 403 | struct kobject kobj; |
401 | struct block_device *bdev; | 404 | struct block_device *bdev; |
402 | 405 | ||
403 | unsigned watermark[WATERMARK_MAX]; | ||
404 | |||
405 | struct task_struct *alloc_thread; | 406 | struct task_struct *alloc_thread; |
406 | 407 | ||
407 | struct closure prio; | 408 | struct closure prio; |
@@ -430,7 +431,7 @@ struct cache { | |||
430 | * because all the data they contained was overwritten), so we only | 431 | * because all the data they contained was overwritten), so we only |
431 | * need to discard them before they can be moved to the free list. | 432 | * need to discard them before they can be moved to the free list. |
432 | */ | 433 | */ |
433 | DECLARE_FIFO(long, free); | 434 | DECLARE_FIFO(long, free)[RESERVE_NR]; |
434 | DECLARE_FIFO(long, free_inc); | 435 | DECLARE_FIFO(long, free_inc); |
435 | DECLARE_FIFO(long, unused); | 436 | DECLARE_FIFO(long, unused); |
436 | 437 | ||
@@ -515,7 +516,8 @@ struct cache_set { | |||
515 | uint64_t cached_dev_sectors; | 516 | uint64_t cached_dev_sectors; |
516 | struct closure caching; | 517 | struct closure caching; |
517 | 518 | ||
518 | struct closure_with_waitlist sb_write; | 519 | struct closure sb_write; |
520 | struct semaphore sb_write_mutex; | ||
519 | 521 | ||
520 | mempool_t *search; | 522 | mempool_t *search; |
521 | mempool_t *bio_meta; | 523 | mempool_t *bio_meta; |
@@ -630,13 +632,15 @@ struct cache_set { | |||
630 | 632 | ||
631 | #ifdef CONFIG_BCACHE_DEBUG | 633 | #ifdef CONFIG_BCACHE_DEBUG |
632 | struct btree *verify_data; | 634 | struct btree *verify_data; |
635 | struct bset *verify_ondisk; | ||
633 | struct mutex verify_lock; | 636 | struct mutex verify_lock; |
634 | #endif | 637 | #endif |
635 | 638 | ||
636 | unsigned nr_uuids; | 639 | unsigned nr_uuids; |
637 | struct uuid_entry *uuids; | 640 | struct uuid_entry *uuids; |
638 | BKEY_PADDED(uuid_bucket); | 641 | BKEY_PADDED(uuid_bucket); |
639 | struct closure_with_waitlist uuid_write; | 642 | struct closure uuid_write; |
643 | struct semaphore uuid_write_mutex; | ||
640 | 644 | ||
641 | /* | 645 | /* |
642 | * A btree node on disk could have too many bsets for an iterator to fit | 646 | * A btree node on disk could have too many bsets for an iterator to fit |
@@ -644,13 +648,7 @@ struct cache_set { | |||
644 | */ | 648 | */ |
645 | mempool_t *fill_iter; | 649 | mempool_t *fill_iter; |
646 | 650 | ||
647 | /* | 651 | struct bset_sort_state sort; |
648 | * btree_sort() is a merge sort and requires temporary space - single | ||
649 | * element mempool | ||
650 | */ | ||
651 | struct mutex sort_lock; | ||
652 | struct bset *sort; | ||
653 | unsigned sort_crit_factor; | ||
654 | 652 | ||
655 | /* List of buckets we're currently writing data to */ | 653 | /* List of buckets we're currently writing data to */ |
656 | struct list_head data_buckets; | 654 | struct list_head data_buckets; |
@@ -666,7 +664,6 @@ struct cache_set { | |||
666 | unsigned congested_read_threshold_us; | 664 | unsigned congested_read_threshold_us; |
667 | unsigned congested_write_threshold_us; | 665 | unsigned congested_write_threshold_us; |
668 | 666 | ||
669 | struct time_stats sort_time; | ||
670 | struct time_stats btree_gc_time; | 667 | struct time_stats btree_gc_time; |
671 | struct time_stats btree_split_time; | 668 | struct time_stats btree_split_time; |
672 | struct time_stats btree_read_time; | 669 | struct time_stats btree_read_time; |
@@ -684,9 +681,9 @@ struct cache_set { | |||
684 | unsigned error_decay; | 681 | unsigned error_decay; |
685 | 682 | ||
686 | unsigned short journal_delay_ms; | 683 | unsigned short journal_delay_ms; |
684 | bool expensive_debug_checks; | ||
687 | unsigned verify:1; | 685 | unsigned verify:1; |
688 | unsigned key_merging_disabled:1; | 686 | unsigned key_merging_disabled:1; |
689 | unsigned expensive_debug_checks:1; | ||
690 | unsigned gc_always_rewrite:1; | 687 | unsigned gc_always_rewrite:1; |
691 | unsigned shrinker_disabled:1; | 688 | unsigned shrinker_disabled:1; |
692 | unsigned copy_gc_enabled:1; | 689 | unsigned copy_gc_enabled:1; |
@@ -708,13 +705,8 @@ struct bbio { | |||
708 | struct bio bio; | 705 | struct bio bio; |
709 | }; | 706 | }; |
710 | 707 | ||
711 | static inline unsigned local_clock_us(void) | ||
712 | { | ||
713 | return local_clock() >> 10; | ||
714 | } | ||
715 | |||
716 | #define BTREE_PRIO USHRT_MAX | 708 | #define BTREE_PRIO USHRT_MAX |
717 | #define INITIAL_PRIO 32768 | 709 | #define INITIAL_PRIO 32768U |
718 | 710 | ||
719 | #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) | 711 | #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) |
720 | #define btree_blocks(b) \ | 712 | #define btree_blocks(b) \ |
@@ -727,21 +719,6 @@ static inline unsigned local_clock_us(void) | |||
727 | #define bucket_bytes(c) ((c)->sb.bucket_size << 9) | 719 | #define bucket_bytes(c) ((c)->sb.bucket_size << 9) |
728 | #define block_bytes(c) ((c)->sb.block_size << 9) | 720 | #define block_bytes(c) ((c)->sb.block_size << 9) |
729 | 721 | ||
730 | #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) | ||
731 | #define set_bytes(i) __set_bytes(i, i->keys) | ||
732 | |||
733 | #define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) | ||
734 | #define set_blocks(i, c) __set_blocks(i, (i)->keys, c) | ||
735 | |||
736 | #define node(i, j) ((struct bkey *) ((i)->d + (j))) | ||
737 | #define end(i) node(i, (i)->keys) | ||
738 | |||
739 | #define index(i, b) \ | ||
740 | ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ | ||
741 | block_bytes(b->c))) | ||
742 | |||
743 | #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) | ||
744 | |||
745 | #define prios_per_bucket(c) \ | 722 | #define prios_per_bucket(c) \ |
746 | ((bucket_bytes(c) - sizeof(struct prio_set)) / \ | 723 | ((bucket_bytes(c) - sizeof(struct prio_set)) / \ |
747 | sizeof(struct bucket_disk)) | 724 | sizeof(struct bucket_disk)) |
@@ -784,20 +761,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, | |||
784 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); | 761 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); |
785 | } | 762 | } |
786 | 763 | ||
787 | /* Btree key macros */ | 764 | static inline uint8_t gen_after(uint8_t a, uint8_t b) |
765 | { | ||
766 | uint8_t r = a - b; | ||
767 | return r > 128U ? 0 : r; | ||
768 | } | ||
788 | 769 | ||
789 | static inline void bkey_init(struct bkey *k) | 770 | static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, |
771 | unsigned i) | ||
790 | { | 772 | { |
791 | *k = ZERO_KEY; | 773 | return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); |
792 | } | 774 | } |
793 | 775 | ||
776 | static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||
777 | unsigned i) | ||
778 | { | ||
779 | return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||
780 | } | ||
781 | |||
782 | /* Btree key macros */ | ||
783 | |||
794 | /* | 784 | /* |
795 | * This is used for various on disk data structures - cache_sb, prio_set, bset, | 785 | * This is used for various on disk data structures - cache_sb, prio_set, bset, |
796 | * jset: The checksum is _always_ the first 8 bytes of these structs | 786 | * jset: The checksum is _always_ the first 8 bytes of these structs |
797 | */ | 787 | */ |
798 | #define csum_set(i) \ | 788 | #define csum_set(i) \ |
799 | bch_crc64(((void *) (i)) + sizeof(uint64_t), \ | 789 | bch_crc64(((void *) (i)) + sizeof(uint64_t), \ |
800 | ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) | 790 | ((void *) bset_bkey_last(i)) - \ |
791 | (((void *) (i)) + sizeof(uint64_t))) | ||
801 | 792 | ||
802 | /* Error handling macros */ | 793 | /* Error handling macros */ |
803 | 794 | ||
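gen_after() and ptr_stale(), added to the header here, rely on 8-bit modular arithmetic: generation counters wrap, so the difference is taken modulo 256 and anything more than half a lap ahead is treated as not-after. A small user-space check of that arithmetic, with made-up generation numbers:

/* Demonstrates the modular comparison used by gen_after()/ptr_stale(). */
#include <stdint.h>
#include <stdio.h>

static uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t r = a - b;		/* wraps modulo 256 */

	return r > 128U ? 0 : r;	/* differences > 128 mean b is actually newer */
}

int main(void)
{
	/* bucket gen 3, pointer gen 250: the counter wrapped, pointer is 9 gens stale */
	printf("%u\n", gen_after(3, 250));
	/* pointer gen ahead of bucket gen: not stale at all */
	printf("%u\n", gen_after(10, 12));
	return 0;
}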
@@ -902,7 +893,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); | |||
902 | void bch_bbio_free(struct bio *, struct cache_set *); | 893 | void bch_bbio_free(struct bio *, struct cache_set *); |
903 | struct bio *bch_bbio_alloc(struct cache_set *); | 894 | struct bio *bch_bbio_alloc(struct cache_set *); |
904 | 895 | ||
905 | struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); | ||
906 | void bch_generic_make_request(struct bio *, struct bio_split_pool *); | 896 | void bch_generic_make_request(struct bio *, struct bio_split_pool *); |
907 | void __bch_submit_bbio(struct bio *, struct cache_set *); | 897 | void __bch_submit_bbio(struct bio *, struct cache_set *); |
908 | void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); | 898 | void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); |
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..3f74b4b0747b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@ | |||
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "bcache.h" | 8 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ |
9 | #include "btree.h" | ||
10 | #include "debug.h" | ||
11 | 9 | ||
10 | #include "util.h" | ||
11 | #include "bset.h" | ||
12 | |||
13 | #include <linux/console.h> | ||
12 | #include <linux/random.h> | 14 | #include <linux/random.h> |
13 | #include <linux/prefetch.h> | 15 | #include <linux/prefetch.h> |
14 | 16 | ||
17 | #ifdef CONFIG_BCACHE_DEBUG | ||
18 | |||
19 | void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) | ||
20 | { | ||
21 | struct bkey *k, *next; | ||
22 | |||
23 | for (k = i->start; k < bset_bkey_last(i); k = next) { | ||
24 | next = bkey_next(k); | ||
25 | |||
26 | printk(KERN_ERR "block %u key %li/%u: ", set, | ||
27 | (uint64_t *) k - i->d, i->keys); | ||
28 | |||
29 | if (b->ops->key_dump) | ||
30 | b->ops->key_dump(b, k); | ||
31 | else | ||
32 | printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); | ||
33 | |||
34 | if (next < bset_bkey_last(i) && | ||
35 | bkey_cmp(k, b->ops->is_extents ? | ||
36 | &START_KEY(next) : next) > 0) | ||
37 | printk(KERN_ERR "Key skipped backwards\n"); | ||
38 | } | ||
39 | } | ||
40 | |||
41 | void bch_dump_bucket(struct btree_keys *b) | ||
42 | { | ||
43 | unsigned i; | ||
44 | |||
45 | console_lock(); | ||
46 | for (i = 0; i <= b->nsets; i++) | ||
47 | bch_dump_bset(b, b->set[i].data, | ||
48 | bset_sector_offset(b, b->set[i].data)); | ||
49 | console_unlock(); | ||
50 | } | ||
51 | |||
52 | int __bch_count_data(struct btree_keys *b) | ||
53 | { | ||
54 | unsigned ret = 0; | ||
55 | struct btree_iter iter; | ||
56 | struct bkey *k; | ||
57 | |||
58 | if (b->ops->is_extents) | ||
59 | for_each_key(b, k, &iter) | ||
60 | ret += KEY_SIZE(k); | ||
61 | return ret; | ||
62 | } | ||
63 | |||
64 | void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) | ||
65 | { | ||
66 | va_list args; | ||
67 | struct bkey *k, *p = NULL; | ||
68 | struct btree_iter iter; | ||
69 | const char *err; | ||
70 | |||
71 | for_each_key(b, k, &iter) { | ||
72 | if (b->ops->is_extents) { | ||
73 | err = "Keys out of order"; | ||
74 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) | ||
75 | goto bug; | ||
76 | |||
77 | if (bch_ptr_invalid(b, k)) | ||
78 | continue; | ||
79 | |||
80 | err = "Overlapping keys"; | ||
81 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) | ||
82 | goto bug; | ||
83 | } else { | ||
84 | if (bch_ptr_bad(b, k)) | ||
85 | continue; | ||
86 | |||
87 | err = "Duplicate keys"; | ||
88 | if (p && !bkey_cmp(p, k)) | ||
89 | goto bug; | ||
90 | } | ||
91 | p = k; | ||
92 | } | ||
93 | #if 0 | ||
94 | err = "Key larger than btree node key"; | ||
95 | if (p && bkey_cmp(p, &b->key) > 0) | ||
96 | goto bug; | ||
97 | #endif | ||
98 | return; | ||
99 | bug: | ||
100 | bch_dump_bucket(b); | ||
101 | |||
102 | va_start(args, fmt); | ||
103 | vprintk(fmt, args); | ||
104 | va_end(args); | ||
105 | |||
106 | panic("bch_check_keys error: %s:\n", err); | ||
107 | } | ||
108 | |||
109 | static void bch_btree_iter_next_check(struct btree_iter *iter) | ||
110 | { | ||
111 | struct bkey *k = iter->data->k, *next = bkey_next(k); | ||
112 | |||
113 | if (next < iter->data->end && | ||
114 | bkey_cmp(k, iter->b->ops->is_extents ? | ||
115 | &START_KEY(next) : next) > 0) { | ||
116 | bch_dump_bucket(iter->b); | ||
117 | panic("Key skipped backwards\n"); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | #else | ||
122 | |||
123 | static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} | ||
124 | |||
125 | #endif | ||
126 | |||
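The checks above boil down to two per-bset invariants: extent keys must be ordered by their start offset and must not overlap, while non-extent keys must be strictly ordered with no duplicates. Below is a much-simplified user-space version of the extent case, using (end offset, size) pairs in place of real bkeys and ignoring pointer validity; it sketches the invariant only, not the bcache data structures.

/* Simplified version of the "Keys out of order" / "Overlapping keys" checks. */
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t offset; uint64_t size; };	/* offset is the *end*, like KEY_OFFSET */

#define START(e) ((e)->offset - (e)->size)

static const char *check_extents(const struct extent *k, unsigned n)
{
	for (unsigned i = 1; i < n; i++) {
		if (START(&k[i]) < START(&k[i - 1]))
			return "Keys out of order";
		if (START(&k[i]) < k[i - 1].offset)
			return "Overlapping keys";
	}
	return NULL;
}

int main(void)
{
	struct extent good[] = { { 8, 8 }, { 16, 8 }, { 32, 16 } };
	struct extent bad[]  = { { 8, 8 }, { 20, 16 } };	/* starts at 4, overlaps [0,8) */
	const char *err;

	err = check_extents(good, 3);
	printf("good: %s\n", err ? err : "ok");
	err = check_extents(bad, 2);
	printf("bad: %s\n", err ? err : "ok");
	return 0;
}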
15 | /* Keylists */ | 127 | /* Keylists */ |
16 | 128 | ||
17 | int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) | 129 | int __bch_keylist_realloc(struct keylist *l, unsigned u64s) |
18 | { | 130 | { |
19 | size_t oldsize = bch_keylist_nkeys(l); | 131 | size_t oldsize = bch_keylist_nkeys(l); |
20 | size_t newsize = oldsize + 2 + nptrs; | 132 | size_t newsize = oldsize + u64s; |
21 | uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; | 133 | uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; |
22 | uint64_t *new_keys; | 134 | uint64_t *new_keys; |
23 | 135 | ||
24 | /* The journalling code doesn't handle the case where the keys to insert | ||
25 | * is bigger than an empty write: If we just return -ENOMEM here, | ||
26 | * bio_insert() and bio_invalidate() will insert the keys created so far | ||
27 | * and finish the rest when the keylist is empty. | ||
28 | */ | ||
29 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | ||
30 | return -ENOMEM; | ||
31 | |||
32 | newsize = roundup_pow_of_two(newsize); | 136 | newsize = roundup_pow_of_two(newsize); |
33 | 137 | ||
34 | if (newsize <= KEYLIST_INLINE || | 138 | if (newsize <= KEYLIST_INLINE || |
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l) | |||
71 | bch_keylist_bytes(l)); | 175 | bch_keylist_bytes(l)); |
72 | } | 176 | } |
73 | 177 | ||
74 | /* Pointer validation */ | ||
75 | |||
76 | static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
77 | { | ||
78 | unsigned i; | ||
79 | |||
80 | for (i = 0; i < KEY_PTRS(k); i++) | ||
81 | if (ptr_available(c, k, i)) { | ||
82 | struct cache *ca = PTR_CACHE(c, k, i); | ||
83 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
84 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
85 | |||
86 | if (KEY_SIZE(k) + r > c->sb.bucket_size || | ||
87 | bucket < ca->sb.first_bucket || | ||
88 | bucket >= ca->sb.nbuckets) | ||
89 | return true; | ||
90 | } | ||
91 | |||
92 | return false; | ||
93 | } | ||
94 | |||
95 | bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
96 | { | ||
97 | char buf[80]; | ||
98 | |||
99 | if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) | ||
100 | goto bad; | ||
101 | |||
102 | if (__ptr_invalid(c, k)) | ||
103 | goto bad; | ||
104 | |||
105 | return false; | ||
106 | bad: | ||
107 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
108 | cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); | ||
109 | return true; | ||
110 | } | ||
111 | |||
112 | bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
113 | { | ||
114 | char buf[80]; | ||
115 | |||
116 | if (!KEY_SIZE(k)) | ||
117 | return true; | ||
118 | |||
119 | if (KEY_SIZE(k) > KEY_OFFSET(k)) | ||
120 | goto bad; | ||
121 | |||
122 | if (__ptr_invalid(c, k)) | ||
123 | goto bad; | ||
124 | |||
125 | return false; | ||
126 | bad: | ||
127 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
128 | cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); | ||
129 | return true; | ||
130 | } | ||
131 | |||
132 | static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, | ||
133 | unsigned ptr) | ||
134 | { | ||
135 | struct bucket *g = PTR_BUCKET(b->c, k, ptr); | ||
136 | char buf[80]; | ||
137 | |||
138 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
139 | if (b->level) { | ||
140 | if (KEY_DIRTY(k) || | ||
141 | g->prio != BTREE_PRIO || | ||
142 | (b->c->gc_mark_valid && | ||
143 | GC_MARK(g) != GC_MARK_METADATA)) | ||
144 | goto err; | ||
145 | |||
146 | } else { | ||
147 | if (g->prio == BTREE_PRIO) | ||
148 | goto err; | ||
149 | |||
150 | if (KEY_DIRTY(k) && | ||
151 | b->c->gc_mark_valid && | ||
152 | GC_MARK(g) != GC_MARK_DIRTY) | ||
153 | goto err; | ||
154 | } | ||
155 | mutex_unlock(&b->c->bucket_lock); | ||
156 | } | ||
157 | |||
158 | return false; | ||
159 | err: | ||
160 | mutex_unlock(&b->c->bucket_lock); | ||
161 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
162 | btree_bug(b, | ||
163 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
164 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | ||
165 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
166 | return true; | ||
167 | } | ||
168 | |||
169 | bool bch_ptr_bad(struct btree *b, const struct bkey *k) | ||
170 | { | ||
171 | struct bucket *g; | ||
172 | unsigned i, stale; | ||
173 | |||
174 | if (!bkey_cmp(k, &ZERO_KEY) || | ||
175 | !KEY_PTRS(k) || | ||
176 | bch_ptr_invalid(b, k)) | ||
177 | return true; | ||
178 | |||
179 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
180 | if (!ptr_available(b->c, k, i)) | ||
181 | return true; | ||
182 | |||
183 | g = PTR_BUCKET(b->c, k, i); | ||
184 | stale = ptr_stale(b->c, k, i); | ||
185 | |||
186 | btree_bug_on(stale > 96, b, | ||
187 | "key too stale: %i, need_gc %u", | ||
188 | stale, b->c->need_gc); | ||
189 | |||
190 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||
191 | b, "stale dirty pointer"); | ||
192 | |||
193 | if (stale) | ||
194 | return true; | ||
195 | |||
196 | if (expensive_debug_checks(b->c) && | ||
197 | ptr_bad_expensive_checks(b, k, i)) | ||
198 | return true; | ||
199 | } | ||
200 | |||
201 | return false; | ||
202 | } | ||
203 | |||
204 | /* Key/pointer manipulation */ | 178 | /* Key/pointer manipulation */ |
205 | 179 | ||
206 | void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, | 180 | void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, |
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) | |||
255 | return true; | 229 | return true; |
256 | } | 230 | } |
257 | 231 | ||
258 | static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | 232 | /* Auxiliary search trees */ |
233 | |||
234 | /* 32 bits total: */ | ||
235 | #define BKEY_MID_BITS 3 | ||
236 | #define BKEY_EXPONENT_BITS 7 | ||
237 | #define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) | ||
238 | #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) | ||
239 | |||
240 | struct bkey_float { | ||
241 | unsigned exponent:BKEY_EXPONENT_BITS; | ||
242 | unsigned m:BKEY_MID_BITS; | ||
243 | unsigned mantissa:BKEY_MANTISSA_BITS; | ||
244 | } __packed; | ||
245 | |||
246 | /* | ||
247 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||
248 | * it used to be 64, but I realized the lookup code would touch slightly less | ||
249 | * memory if it was 128. | ||
250 | * | ||
251 | * It defines the number of bytes (in struct bset) per struct bkey_float in | ||
252 | * the auxiliary search tree - when we're done searching the bset_float tree we | ||
253 | * have this many bytes left that we do a linear search over. | ||
254 | * | ||
255 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||
256 | * we're touching one fewer cacheline in the bset tree in exchange for one more | ||
257 | * cacheline in the linear search - but the linear search might stop before it | ||
258 | * gets to the second cacheline. | ||
259 | */ | ||
260 | |||
261 | #define BSET_CACHELINE 128 | ||
262 | |||
263 | /* Space required for the btree node keys */ | ||
264 | static inline size_t btree_keys_bytes(struct btree_keys *b) | ||
259 | { | 265 | { |
260 | return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | 266 | return PAGE_SIZE << b->page_order; |
261 | ~((uint64_t)1 << 63); | ||
262 | } | 267 | } |
263 | 268 | ||
264 | /* Tries to merge l and r: l should be lower than r | 269 | static inline size_t btree_keys_cachelines(struct btree_keys *b) |
265 | * Returns true if we were able to merge. If we did merge, l will be the merged | ||
266 | * key, r will be untouched. | ||
267 | */ | ||
268 | bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) | ||
269 | { | 270 | { |
270 | unsigned i; | 271 | return btree_keys_bytes(b) / BSET_CACHELINE; |
272 | } | ||
271 | 273 | ||
272 | if (key_merging_disabled(b->c)) | 274 | /* Space required for the auxiliary search trees */ |
273 | return false; | 275 | static inline size_t bset_tree_bytes(struct btree_keys *b) |
276 | { | ||
277 | return btree_keys_cachelines(b) * sizeof(struct bkey_float); | ||
278 | } | ||
274 | 279 | ||
275 | if (KEY_PTRS(l) != KEY_PTRS(r) || | 280 | /* Space required for the prev pointers */ |
276 | KEY_DIRTY(l) != KEY_DIRTY(r) || | 281 | static inline size_t bset_prev_bytes(struct btree_keys *b) |
277 | bkey_cmp(l, &START_KEY(r))) | 282 | { |
278 | return false; | 283 | return btree_keys_cachelines(b) * sizeof(uint8_t); |
284 | } | ||
279 | 285 | ||
280 | for (i = 0; i < KEY_PTRS(l); i++) | 286 | /* Memory allocation */ |
281 | if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || | ||
282 | PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) | ||
283 | return false; | ||
284 | 287 | ||
285 | /* Keys with no pointers aren't restricted to one bucket and could | 288 | void bch_btree_keys_free(struct btree_keys *b) |
286 | * overflow KEY_SIZE | 289 | { |
287 | */ | 290 | struct bset_tree *t = b->set; |
288 | if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { | ||
289 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); | ||
290 | SET_KEY_SIZE(l, USHRT_MAX); | ||
291 | 291 | ||
292 | bch_cut_front(l, r); | 292 | if (bset_prev_bytes(b) < PAGE_SIZE) |
293 | return false; | 293 | kfree(t->prev); |
294 | } | 294 | else |
295 | free_pages((unsigned long) t->prev, | ||
296 | get_order(bset_prev_bytes(b))); | ||
295 | 297 | ||
296 | if (KEY_CSUM(l)) { | 298 | if (bset_tree_bytes(b) < PAGE_SIZE) |
297 | if (KEY_CSUM(r)) | 299 | kfree(t->tree); |
298 | l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); | 300 | else |
299 | else | 301 | free_pages((unsigned long) t->tree, |
300 | SET_KEY_CSUM(l, 0); | 302 | get_order(bset_tree_bytes(b))); |
301 | } | ||
302 | 303 | ||
303 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); | 304 | free_pages((unsigned long) t->data, b->page_order); |
304 | SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); | ||
305 | 305 | ||
306 | return true; | 306 | t->prev = NULL; |
307 | t->tree = NULL; | ||
308 | t->data = NULL; | ||
309 | } | ||
310 | EXPORT_SYMBOL(bch_btree_keys_free); | ||
311 | |||
312 | int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) | ||
313 | { | ||
314 | struct bset_tree *t = b->set; | ||
315 | |||
316 | BUG_ON(t->data); | ||
317 | |||
318 | b->page_order = page_order; | ||
319 | |||
320 | t->data = (void *) __get_free_pages(gfp, b->page_order); | ||
321 | if (!t->data) | ||
322 | goto err; | ||
323 | |||
324 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | ||
325 | ? kmalloc(bset_tree_bytes(b), gfp) | ||
326 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | ||
327 | if (!t->tree) | ||
328 | goto err; | ||
329 | |||
330 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | ||
331 | ? kmalloc(bset_prev_bytes(b), gfp) | ||
332 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | ||
333 | if (!t->prev) | ||
334 | goto err; | ||
335 | |||
336 | return 0; | ||
337 | err: | ||
338 | bch_btree_keys_free(b); | ||
339 | return -ENOMEM; | ||
307 | } | 340 | } |
341 | EXPORT_SYMBOL(bch_btree_keys_alloc); | ||
342 | |||
343 | void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, | ||
344 | bool *expensive_debug_checks) | ||
345 | { | ||
346 | unsigned i; | ||
347 | |||
348 | b->ops = ops; | ||
349 | b->expensive_debug_checks = expensive_debug_checks; | ||
350 | b->nsets = 0; | ||
351 | b->last_set_unwritten = 0; | ||
352 | |||
353 | /* XXX: shouldn't be needed */ | ||
354 | for (i = 0; i < MAX_BSETS; i++) | ||
355 | b->set[i].size = 0; | ||
356 | /* | ||
357 | * Second loop starts at 1 because b->keys[0]->data is the memory we | ||
358 | * allocated | ||
359 | */ | ||
360 | for (i = 1; i < MAX_BSETS; i++) | ||
361 | b->set[i].data = NULL; | ||
362 | } | ||
363 | EXPORT_SYMBOL(bch_btree_keys_init); | ||
308 | 364 | ||
309 | /* Binary tree stuff for auxiliary search trees */ | 365 | /* Binary tree stuff for auxiliary search trees */ |
310 | 366 | ||
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) | |||
455 | return ((void *) k - (void *) t->data) / BSET_CACHELINE; | 511 | return ((void *) k - (void *) t->data) / BSET_CACHELINE; |
456 | } | 512 | } |
457 | 513 | ||
458 | static unsigned bkey_to_cacheline_offset(struct bkey *k) | 514 | static unsigned bkey_to_cacheline_offset(struct bset_tree *t, |
515 | unsigned cacheline, | ||
516 | struct bkey *k) | ||
459 | { | 517 | { |
460 | return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); | 518 | return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); |
461 | } | 519 | } |
462 | 520 | ||
463 | static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) | 521 | static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) |
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) | |||
504 | : tree_to_prev_bkey(t, j >> ffs(j)); | 562 | : tree_to_prev_bkey(t, j >> ffs(j)); |
505 | 563 | ||
506 | struct bkey *r = is_power_of_2(j + 1) | 564 | struct bkey *r = is_power_of_2(j + 1) |
507 | ? node(t->data, t->data->keys - bkey_u64s(&t->end)) | 565 | ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) |
508 | : tree_to_bkey(t, j >> (ffz(j) + 1)); | 566 | : tree_to_bkey(t, j >> (ffz(j) + 1)); |
509 | 567 | ||
510 | BUG_ON(m < l || m > r); | 568 | BUG_ON(m < l || m > r); |
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j) | |||
528 | f->exponent = 127; | 586 | f->exponent = 127; |
529 | } | 587 | } |
530 | 588 | ||
531 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | 589 | static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) |
532 | { | 590 | { |
533 | if (t != b->sets) { | 591 | if (t != b->set) { |
534 | unsigned j = roundup(t[-1].size, | 592 | unsigned j = roundup(t[-1].size, |
535 | 64 / sizeof(struct bkey_float)); | 593 | 64 / sizeof(struct bkey_float)); |
536 | 594 | ||
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | |||
538 | t->prev = t[-1].prev + j; | 596 | t->prev = t[-1].prev + j; |
539 | } | 597 | } |
540 | 598 | ||
541 | while (t < b->sets + MAX_BSETS) | 599 | while (t < b->set + MAX_BSETS) |
542 | t++->size = 0; | 600 | t++->size = 0; |
543 | } | 601 | } |
544 | 602 | ||
545 | static void bset_build_unwritten_tree(struct btree *b) | 603 | static void bch_bset_build_unwritten_tree(struct btree_keys *b) |
546 | { | 604 | { |
547 | struct bset_tree *t = b->sets + b->nsets; | 605 | struct bset_tree *t = bset_tree_last(b); |
606 | |||
607 | BUG_ON(b->last_set_unwritten); | ||
608 | b->last_set_unwritten = 1; | ||
548 | 609 | ||
549 | bset_alloc_tree(b, t); | 610 | bset_alloc_tree(b, t); |
550 | 611 | ||
551 | if (t->tree != b->sets->tree + bset_tree_space(b)) { | 612 | if (t->tree != b->set->tree + btree_keys_cachelines(b)) { |
552 | t->prev[0] = bkey_to_cacheline_offset(t->data->start); | 613 | t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); |
553 | t->size = 1; | 614 | t->size = 1; |
554 | } | 615 | } |
555 | } | 616 | } |
556 | 617 | ||
557 | static void bset_build_written_tree(struct btree *b) | 618 | void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) |
619 | { | ||
620 | if (i != b->set->data) { | ||
621 | b->set[++b->nsets].data = i; | ||
622 | i->seq = b->set->data->seq; | ||
623 | } else | ||
624 | get_random_bytes(&i->seq, sizeof(uint64_t)); | ||
625 | |||
626 | i->magic = magic; | ||
627 | i->version = 0; | ||
628 | i->keys = 0; | ||
629 | |||
630 | bch_bset_build_unwritten_tree(b); | ||
631 | } | ||
632 | EXPORT_SYMBOL(bch_bset_init_next); | ||
633 | |||
634 | void bch_bset_build_written_tree(struct btree_keys *b) | ||
558 | { | 635 | { |
559 | struct bset_tree *t = b->sets + b->nsets; | 636 | struct bset_tree *t = bset_tree_last(b); |
560 | struct bkey *k = t->data->start; | 637 | struct bkey *prev = NULL, *k = t->data->start; |
561 | unsigned j, cacheline = 1; | 638 | unsigned j, cacheline = 1; |
562 | 639 | ||
640 | b->last_set_unwritten = 0; | ||
641 | |||
563 | bset_alloc_tree(b, t); | 642 | bset_alloc_tree(b, t); |
564 | 643 | ||
565 | t->size = min_t(unsigned, | 644 | t->size = min_t(unsigned, |
566 | bkey_to_cacheline(t, end(t->data)), | 645 | bkey_to_cacheline(t, bset_bkey_last(t->data)), |
567 | b->sets->tree + bset_tree_space(b) - t->tree); | 646 | b->set->tree + btree_keys_cachelines(b) - t->tree); |
568 | 647 | ||
569 | if (t->size < 2) { | 648 | if (t->size < 2) { |
570 | t->size = 0; | 649 | t->size = 0; |
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b) | |||
577 | for (j = inorder_next(0, t->size); | 656 | for (j = inorder_next(0, t->size); |
578 | j; | 657 | j; |
579 | j = inorder_next(j, t->size)) { | 658 | j = inorder_next(j, t->size)) { |
580 | while (bkey_to_cacheline(t, k) != cacheline) | 659 | while (bkey_to_cacheline(t, k) < cacheline) |
581 | k = bkey_next(k); | 660 | prev = k, k = bkey_next(k); |
582 | 661 | ||
583 | t->prev[j] = bkey_u64s(k); | 662 | t->prev[j] = bkey_u64s(prev); |
584 | k = bkey_next(k); | 663 | t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); |
585 | cacheline++; | ||
586 | t->tree[j].m = bkey_to_cacheline_offset(k); | ||
587 | } | 664 | } |
588 | 665 | ||
589 | while (bkey_next(k) != end(t->data)) | 666 | while (bkey_next(k) != bset_bkey_last(t->data)) |
590 | k = bkey_next(k); | 667 | k = bkey_next(k); |
591 | 668 | ||
592 | t->end = *k; | 669 | t->end = *k; |
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b) | |||
597 | j = inorder_next(j, t->size)) | 674 | j = inorder_next(j, t->size)) |
598 | make_bfloat(t, j); | 675 | make_bfloat(t, j); |
599 | } | 676 | } |
677 | EXPORT_SYMBOL(bch_bset_build_written_tree); | ||
600 | 678 | ||
601 | void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) | 679 | /* Insert */ |
680 | |||
681 | void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) | ||
602 | { | 682 | { |
603 | struct bset_tree *t; | 683 | struct bset_tree *t; |
604 | unsigned inorder, j = 1; | 684 | unsigned inorder, j = 1; |
605 | 685 | ||
606 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 686 | for (t = b->set; t <= bset_tree_last(b); t++) |
607 | if (k < end(t->data)) | 687 | if (k < bset_bkey_last(t->data)) |
608 | goto found_set; | 688 | goto found_set; |
609 | 689 | ||
610 | BUG(); | 690 | BUG(); |
@@ -617,7 +697,7 @@ found_set: | |||
617 | if (k == t->data->start) | 697 | if (k == t->data->start) |
618 | goto fix_left; | 698 | goto fix_left; |
619 | 699 | ||
620 | if (bkey_next(k) == end(t->data)) { | 700 | if (bkey_next(k) == bset_bkey_last(t->data)) { |
621 | t->end = *k; | 701 | t->end = *k; |
622 | goto fix_right; | 702 | goto fix_right; |
623 | } | 703 | } |
@@ -642,10 +722,12 @@ fix_right: do { | |||
642 | j = j * 2 + 1; | 722 | j = j * 2 + 1; |
643 | } while (j < t->size); | 723 | } while (j < t->size); |
644 | } | 724 | } |
725 | EXPORT_SYMBOL(bch_bset_fix_invalidated_key); | ||
645 | 726 | ||
646 | void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) | 727 | static void bch_bset_fix_lookup_table(struct btree_keys *b, |
728 | struct bset_tree *t, | ||
729 | struct bkey *k) | ||
647 | { | 730 | { |
648 | struct bset_tree *t = &b->sets[b->nsets]; | ||
649 | unsigned shift = bkey_u64s(k); | 731 | unsigned shift = bkey_u64s(k); |
650 | unsigned j = bkey_to_cacheline(t, k); | 732 | unsigned j = bkey_to_cacheline(t, k); |
651 | 733 | ||
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) | |||
657 | * lookup table for the first key that is strictly greater than k: | 739 | * lookup table for the first key that is strictly greater than k: |
658 | * it's either k's cacheline or the next one | 740 | * it's either k's cacheline or the next one |
659 | */ | 741 | */ |
660 | if (j < t->size && | 742 | while (j < t->size && |
661 | table_to_bkey(t, j) <= k) | 743 | table_to_bkey(t, j) <= k) |
662 | j++; | 744 | j++; |
663 | 745 | ||
664 | /* Adjust all the lookup table entries, and find a new key for any that | 746 | /* Adjust all the lookup table entries, and find a new key for any that |
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) | |||
673 | while (k < cacheline_to_bkey(t, j, 0)) | 755 | while (k < cacheline_to_bkey(t, j, 0)) |
674 | k = bkey_next(k); | 756 | k = bkey_next(k); |
675 | 757 | ||
676 | t->prev[j] = bkey_to_cacheline_offset(k); | 758 | t->prev[j] = bkey_to_cacheline_offset(t, j, k); |
677 | } | 759 | } |
678 | } | 760 | } |
679 | 761 | ||
680 | if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) | 762 | if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) |
681 | return; | 763 | return; |
682 | 764 | ||
683 | /* Possibly add a new entry to the end of the lookup table */ | 765 | /* Possibly add a new entry to the end of the lookup table */ |
684 | 766 | ||
685 | for (k = table_to_bkey(t, t->size - 1); | 767 | for (k = table_to_bkey(t, t->size - 1); |
686 | k != end(t->data); | 768 | k != bset_bkey_last(t->data); |
687 | k = bkey_next(k)) | 769 | k = bkey_next(k)) |
688 | if (t->size == bkey_to_cacheline(t, k)) { | 770 | if (t->size == bkey_to_cacheline(t, k)) { |
689 | t->prev[t->size] = bkey_to_cacheline_offset(k); | 771 | t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); |
690 | t->size++; | 772 | t->size++; |
691 | } | 773 | } |
692 | } | 774 | } |
693 | 775 | ||
694 | void bch_bset_init_next(struct btree *b) | 776 | /* |
777 | * Tries to merge l and r: l should be lower than r | ||
778 | * Returns true if we were able to merge. If we did merge, l will be the merged | ||
779 | * key, r will be untouched. | ||
780 | */ | ||
781 | bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) | ||
695 | { | 782 | { |
696 | struct bset *i = write_block(b); | 783 | if (!b->ops->key_merge) |
784 | return false; | ||
697 | 785 | ||
698 | if (i != b->sets[0].data) { | 786 | /* |
699 | b->sets[++b->nsets].data = i; | 787 | * Generic header checks |
700 | i->seq = b->sets[0].data->seq; | 788 | * Assumes left and right are in order |
701 | } else | 789 | * Left and right must be exactly aligned |
702 | get_random_bytes(&i->seq, sizeof(uint64_t)); | 790 | */ |
791 | if (!bch_bkey_equal_header(l, r) || | ||
792 | bkey_cmp(l, &START_KEY(r))) | ||
793 | return false; | ||
703 | 794 | ||
704 | i->magic = bset_magic(&b->c->sb); | 795 | return b->ops->key_merge(b, l, r); |
705 | i->version = 0; | 796 | } |
706 | i->keys = 0; | 797 | EXPORT_SYMBOL(bch_bkey_try_merge); |
798 | |||
799 | void bch_bset_insert(struct btree_keys *b, struct bkey *where, | ||
800 | struct bkey *insert) | ||
801 | { | ||
802 | struct bset_tree *t = bset_tree_last(b); | ||
803 | |||
804 | BUG_ON(!b->last_set_unwritten); | ||
805 | BUG_ON(bset_byte_offset(b, t->data) + | ||
806 | __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > | ||
807 | PAGE_SIZE << b->page_order); | ||
808 | |||
809 | memmove((uint64_t *) where + bkey_u64s(insert), | ||
810 | where, | ||
811 | (void *) bset_bkey_last(t->data) - (void *) where); | ||
812 | |||
813 | t->data->keys += bkey_u64s(insert); | ||
814 | bkey_copy(where, insert); | ||
815 | bch_bset_fix_lookup_table(b, t, where); | ||
816 | } | ||
817 | EXPORT_SYMBOL(bch_bset_insert); | ||
818 | |||
819 | unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, | ||
820 | struct bkey *replace_key) | ||
821 | { | ||
822 | unsigned status = BTREE_INSERT_STATUS_NO_INSERT; | ||
823 | struct bset *i = bset_tree_last(b)->data; | ||
824 | struct bkey *m, *prev = NULL; | ||
825 | struct btree_iter iter; | ||
826 | |||
827 | BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); | ||
828 | |||
829 | m = bch_btree_iter_init(b, &iter, b->ops->is_extents | ||
830 | ? PRECEDING_KEY(&START_KEY(k)) | ||
831 | : PRECEDING_KEY(k)); | ||
832 | |||
833 | if (b->ops->insert_fixup(b, k, &iter, replace_key)) | ||
834 | return status; | ||
707 | 835 | ||
708 | bset_build_unwritten_tree(b); | 836 | status = BTREE_INSERT_STATUS_INSERT; |
837 | |||
838 | while (m != bset_bkey_last(i) && | ||
839 | bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) | ||
840 | prev = m, m = bkey_next(m); | ||
841 | |||
842 | /* prev is in the tree, if we merge we're done */ | ||
843 | status = BTREE_INSERT_STATUS_BACK_MERGE; | ||
844 | if (prev && | ||
845 | bch_bkey_try_merge(b, prev, k)) | ||
846 | goto merged; | ||
847 | #if 0 | ||
848 | status = BTREE_INSERT_STATUS_OVERWROTE; | ||
849 | if (m != bset_bkey_last(i) && | ||
850 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | ||
851 | goto copy; | ||
852 | #endif | ||
853 | status = BTREE_INSERT_STATUS_FRONT_MERGE; | ||
854 | if (m != bset_bkey_last(i) && | ||
855 | bch_bkey_try_merge(b, k, m)) | ||
856 | goto copy; | ||
857 | |||
858 | bch_bset_insert(b, m, k); | ||
859 | copy: bkey_copy(m, k); | ||
860 | merged: | ||
861 | return status; | ||
709 | } | 862 | } |
863 | EXPORT_SYMBOL(bch_btree_insert_key); | ||
864 | |||
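At its core bch_bset_insert() is an insertion into a flat, sorted array: shift the tail up with memmove by the size of the new key, copy it in, then patch the lookup table. A stripped-down user-space sketch of that pattern, with fixed-size 64-bit values standing in for variable-length bkeys and the lookup-table fixup left out:

/* Sorted-array insertion in the style of bch_bset_insert() (no lookup table). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CAPACITY 16

struct flat_set { uint64_t d[CAPACITY]; unsigned keys; };

static void set_insert(struct flat_set *s, uint64_t v)
{
	unsigned i = 0;

	if (s->keys == CAPACITY)	/* the real code BUG_ON()s on overflow instead */
		return;

	/* find "where": the first element greater than the new value */
	while (i < s->keys && s->d[i] <= v)
		i++;

	/* shift the tail up one slot, then drop the new value in */
	memmove(&s->d[i + 1], &s->d[i], (s->keys - i) * sizeof(uint64_t));
	s->d[i] = v;
	s->keys++;
}

int main(void)
{
	struct flat_set s = { .keys = 0 };
	uint64_t vals[] = { 40, 10, 30, 20 };

	for (unsigned i = 0; i < sizeof(vals) / sizeof(vals[0]); i++)
		set_insert(&s, vals[i]);

	for (unsigned i = 0; i < s.keys; i++)
		printf("%llu ", (unsigned long long) s.d[i]);
	printf("\n");
	return 0;
}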
865 | /* Lookup */ | ||
710 | 866 | ||
711 | struct bset_search_iter { | 867 | struct bset_search_iter { |
712 | struct bkey *l, *r; | 868 | struct bkey *l, *r; |
713 | }; | 869 | }; |
714 | 870 | ||
715 | static struct bset_search_iter bset_search_write_set(struct btree *b, | 871 | static struct bset_search_iter bset_search_write_set(struct bset_tree *t, |
716 | struct bset_tree *t, | ||
717 | const struct bkey *search) | 872 | const struct bkey *search) |
718 | { | 873 | { |
719 | unsigned li = 0, ri = t->size; | 874 | unsigned li = 0, ri = t->size; |
720 | 875 | ||
721 | BUG_ON(!b->nsets && | ||
722 | t->size < bkey_to_cacheline(t, end(t->data))); | ||
723 | |||
724 | while (li + 1 != ri) { | 876 | while (li + 1 != ri) { |
725 | unsigned m = (li + ri) >> 1; | 877 | unsigned m = (li + ri) >> 1; |
726 | 878 | ||
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, | |||
732 | 884 | ||
733 | return (struct bset_search_iter) { | 885 | return (struct bset_search_iter) { |
734 | table_to_bkey(t, li), | 886 | table_to_bkey(t, li), |
735 | ri < t->size ? table_to_bkey(t, ri) : end(t->data) | 887 | ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) |
736 | }; | 888 | }; |
737 | } | 889 | } |
738 | 890 | ||
739 | static struct bset_search_iter bset_search_tree(struct btree *b, | 891 | static struct bset_search_iter bset_search_tree(struct bset_tree *t, |
740 | struct bset_tree *t, | ||
741 | const struct bkey *search) | 892 | const struct bkey *search) |
742 | { | 893 | { |
743 | struct bkey *l, *r; | 894 | struct bkey *l, *r; |
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, | |||
784 | f = &t->tree[inorder_next(j, t->size)]; | 935 | f = &t->tree[inorder_next(j, t->size)]; |
785 | r = cacheline_to_bkey(t, inorder, f->m); | 936 | r = cacheline_to_bkey(t, inorder, f->m); |
786 | } else | 937 | } else |
787 | r = end(t->data); | 938 | r = bset_bkey_last(t->data); |
788 | } else { | 939 | } else { |
789 | r = cacheline_to_bkey(t, inorder, f->m); | 940 | r = cacheline_to_bkey(t, inorder, f->m); |
790 | 941 | ||
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, | |||
798 | return (struct bset_search_iter) {l, r}; | 949 | return (struct bset_search_iter) {l, r}; |
799 | } | 950 | } |
800 | 951 | ||
801 | struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | 952 | struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, |
802 | const struct bkey *search) | 953 | const struct bkey *search) |
803 | { | 954 | { |
804 | struct bset_search_iter i; | 955 | struct bset_search_iter i; |
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
820 | 971 | ||
821 | if (unlikely(!t->size)) { | 972 | if (unlikely(!t->size)) { |
822 | i.l = t->data->start; | 973 | i.l = t->data->start; |
823 | i.r = end(t->data); | 974 | i.r = bset_bkey_last(t->data); |
824 | } else if (bset_written(b, t)) { | 975 | } else if (bset_written(b, t)) { |
825 | /* | 976 | /* |
826 | * Each node in the auxiliary search tree covers a certain range | 977 | * Each node in the auxiliary search tree covers a certain range |
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
830 | */ | 981 | */ |
831 | 982 | ||
832 | if (unlikely(bkey_cmp(search, &t->end) >= 0)) | 983 | if (unlikely(bkey_cmp(search, &t->end) >= 0)) |
833 | return end(t->data); | 984 | return bset_bkey_last(t->data); |
834 | 985 | ||
835 | if (unlikely(bkey_cmp(search, t->data->start) < 0)) | 986 | if (unlikely(bkey_cmp(search, t->data->start) < 0)) |
836 | return t->data->start; | 987 | return t->data->start; |
837 | 988 | ||
838 | i = bset_search_tree(b, t, search); | 989 | i = bset_search_tree(t, search); |
839 | } else | 990 | } else { |
840 | i = bset_search_write_set(b, t, search); | 991 | BUG_ON(!b->nsets && |
992 | t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); | ||
841 | 993 | ||
842 | if (expensive_debug_checks(b->c)) { | 994 | i = bset_search_write_set(t, search); |
995 | } | ||
996 | |||
997 | if (btree_keys_expensive_checks(b)) { | ||
843 | BUG_ON(bset_written(b, t) && | 998 | BUG_ON(bset_written(b, t) && |
844 | i.l != t->data->start && | 999 | i.l != t->data->start && |
845 | bkey_cmp(tree_to_prev_bkey(t, | 1000 | bkey_cmp(tree_to_prev_bkey(t, |
846 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), | 1001 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), |
847 | search) > 0); | 1002 | search) > 0); |
848 | 1003 | ||
849 | BUG_ON(i.r != end(t->data) && | 1004 | BUG_ON(i.r != bset_bkey_last(t->data) && |
850 | bkey_cmp(i.r, search) <= 0); | 1005 | bkey_cmp(i.r, search) <= 0); |
851 | } | 1006 | } |
852 | 1007 | ||
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
856 | 1011 | ||
857 | return i.l; | 1012 | return i.l; |
858 | } | 1013 | } |
1014 | EXPORT_SYMBOL(__bch_bset_search); | ||
859 | 1015 | ||
860 | /* Btree iterator */ | 1016 | /* Btree iterator */ |
861 | 1017 | ||
862 | /* | 1018 | typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, |
863 | * Returns true if l > r - unless l == r, in which case returns true if l is | 1019 | struct btree_iter_set); |
864 | * older than r. | 1020 | |
865 | * | ||
866 | * Necessary for btree_sort_fixup() - if there are multiple keys that compare | ||
867 | * equal in different sets, we have to process them newest to oldest. | ||
868 | */ | ||
869 | static inline bool btree_iter_cmp(struct btree_iter_set l, | 1021 | static inline bool btree_iter_cmp(struct btree_iter_set l, |
870 | struct btree_iter_set r) | 1022 | struct btree_iter_set r) |
871 | { | 1023 | { |
872 | int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | 1024 | return bkey_cmp(l.k, r.k) > 0; |
873 | |||
874 | return c ? c > 0 : l.k < r.k; | ||
875 | } | 1025 | } |
876 | 1026 | ||
877 | static inline bool btree_iter_end(struct btree_iter *iter) | 1027 | static inline bool btree_iter_end(struct btree_iter *iter) |
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, | |||
888 | btree_iter_cmp)); | 1038 | btree_iter_cmp)); |
889 | } | 1039 | } |
890 | 1040 | ||
891 | struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | 1041 | static struct bkey *__bch_btree_iter_init(struct btree_keys *b, |
892 | struct bkey *search, struct bset_tree *start) | 1042 | struct btree_iter *iter, |
1043 | struct bkey *search, | ||
1044 | struct bset_tree *start) | ||
893 | { | 1045 | { |
894 | struct bkey *ret = NULL; | 1046 | struct bkey *ret = NULL; |
895 | iter->size = ARRAY_SIZE(iter->data); | 1047 | iter->size = ARRAY_SIZE(iter->data); |
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | |||
899 | iter->b = b; | 1051 | iter->b = b; |
900 | #endif | 1052 | #endif |
901 | 1053 | ||
902 | for (; start <= &b->sets[b->nsets]; start++) { | 1054 | for (; start <= bset_tree_last(b); start++) { |
903 | ret = bch_bset_search(b, start, search); | 1055 | ret = bch_bset_search(b, start, search); |
904 | bch_btree_iter_push(iter, ret, end(start->data)); | 1056 | bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); |
905 | } | 1057 | } |
906 | 1058 | ||
907 | return ret; | 1059 | return ret; |
908 | } | 1060 | } |
909 | 1061 | ||
910 | struct bkey *bch_btree_iter_next(struct btree_iter *iter) | 1062 | struct bkey *bch_btree_iter_init(struct btree_keys *b, |
1063 | struct btree_iter *iter, | ||
1064 | struct bkey *search) | ||
1065 | { | ||
1066 | return __bch_btree_iter_init(b, iter, search, b->set); | ||
1067 | } | ||
1068 | EXPORT_SYMBOL(bch_btree_iter_init); | ||
1069 | |||
1070 | static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, | ||
1071 | btree_iter_cmp_fn *cmp) | ||
911 | { | 1072 | { |
912 | struct btree_iter_set unused; | 1073 | struct btree_iter_set unused; |
913 | struct bkey *ret = NULL; | 1074 | struct bkey *ret = NULL; |
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) | |||
924 | } | 1085 | } |
925 | 1086 | ||
926 | if (iter->data->k == iter->data->end) | 1087 | if (iter->data->k == iter->data->end) |
927 | heap_pop(iter, unused, btree_iter_cmp); | 1088 | heap_pop(iter, unused, cmp); |
928 | else | 1089 | else |
929 | heap_sift(iter, 0, btree_iter_cmp); | 1090 | heap_sift(iter, 0, cmp); |
930 | } | 1091 | } |
931 | 1092 | ||
932 | return ret; | 1093 | return ret; |
933 | } | 1094 | } |
934 | 1095 | ||
1096 | struct bkey *bch_btree_iter_next(struct btree_iter *iter) | ||
1097 | { | ||
1098 | return __bch_btree_iter_next(iter, btree_iter_cmp); | ||
1099 | |||
1100 | } | ||
1101 | EXPORT_SYMBOL(bch_btree_iter_next); | ||
1102 | |||
935 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | 1103 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, |
936 | struct btree *b, ptr_filter_fn fn) | 1104 | struct btree_keys *b, ptr_filter_fn fn) |
937 | { | 1105 | { |
938 | struct bkey *ret; | 1106 | struct bkey *ret; |
939 | 1107 | ||
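
The iterator above now takes its comparison as a callback, so the same pop-advance-resift loop can serve both the extent and non-extent key formats. What follows is a minimal userspace sketch of that heap-merged iteration over several sorted sets; the key type, heap size and names are simplified stand-ins, not the kernel structures.

/* Minimal sketch of a heap-merged multi-set iterator (simplified types;
 * not the kernel implementation). */
#include <stdio.h>
#include <stddef.h>

struct iter_set { const int *k, *end; };	/* one cursor per sorted set */
struct iter { size_t used; struct iter_set data[4]; };

/* comparison callback, analogous to btree_iter_cmp() */
typedef int (cmp_fn)(struct iter_set, struct iter_set);
static int cmp_min(struct iter_set l, struct iter_set r) { return *l.k < *r.k; }

static void sift(struct iter *it, size_t i, cmp_fn *cmp)
{
	for (;;) {
		size_t l = 2 * i + 1, r = l + 1, best = i;
		if (l < it->used && cmp(it->data[l], it->data[best])) best = l;
		if (r < it->used && cmp(it->data[r], it->data[best])) best = r;
		if (best == i)
			break;
		struct iter_set tmp = it->data[i];
		it->data[i] = it->data[best];
		it->data[best] = tmp;
		i = best;
	}
}

/* pop the smallest head, advance that cursor, drop it when exhausted */
static const int *iter_next(struct iter *it, cmp_fn *cmp)
{
	if (!it->used)
		return NULL;
	const int *ret = it->data[0].k++;
	if (it->data[0].k == it->data[0].end)
		it->data[0] = it->data[--it->used];
	sift(it, 0, cmp);
	return ret;
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 8 };
	struct iter it = { .used = 2, .data = { { a, a + 3 }, { b, b + 3 } } };

	/* heapify first, as btree_mergesort() now does before draining */
	for (size_t i = it.used / 2; i-- > 0;)
		sift(&it, i, cmp_min);

	for (const int *k; (k = iter_next(&it, cmp_min));)
		printf("%d ", *k);	/* prints: 1 2 3 4 8 9 */
	printf("\n");
	return 0;
}
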
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | |||
946 | 1114 | ||
947 | /* Mergesort */ | 1115 | /* Mergesort */ |
948 | 1116 | ||
949 | static void sort_key_next(struct btree_iter *iter, | 1117 | void bch_bset_sort_state_free(struct bset_sort_state *state) |
950 | struct btree_iter_set *i) | ||
951 | { | 1118 | { |
952 | i->k = bkey_next(i->k); | 1119 | if (state->pool) |
953 | 1120 | mempool_destroy(state->pool); | |
954 | if (i->k == i->end) | ||
955 | *i = iter->data[--iter->used]; | ||
956 | } | 1121 | } |
957 | 1122 | ||
958 | static void btree_sort_fixup(struct btree_iter *iter) | 1123 | int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) |
959 | { | 1124 | { |
960 | while (iter->used > 1) { | 1125 | spin_lock_init(&state->time.lock); |
961 | struct btree_iter_set *top = iter->data, *i = top + 1; | ||
962 | |||
963 | if (iter->used > 2 && | ||
964 | btree_iter_cmp(i[0], i[1])) | ||
965 | i++; | ||
966 | |||
967 | if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) | ||
968 | break; | ||
969 | 1126 | ||
970 | if (!KEY_SIZE(i->k)) { | 1127 | state->page_order = page_order; |
971 | sort_key_next(iter, i); | 1128 | state->crit_factor = int_sqrt(1 << page_order); |
972 | heap_sift(iter, i - top, btree_iter_cmp); | ||
973 | continue; | ||
974 | } | ||
975 | 1129 | ||
976 | if (top->k > i->k) { | 1130 | state->pool = mempool_create_page_pool(1, page_order); |
977 | if (bkey_cmp(top->k, i->k) >= 0) | 1131 | if (!state->pool) |
978 | sort_key_next(iter, i); | 1132 | return -ENOMEM; |
979 | else | ||
980 | bch_cut_front(top->k, i->k); | ||
981 | 1133 | ||
982 | heap_sift(iter, i - top, btree_iter_cmp); | 1134 | return 0; |
983 | } else { | ||
984 | /* can't happen because of comparison func */ | ||
985 | BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); | ||
986 | bch_cut_back(&START_KEY(i->k), top->k); | ||
987 | } | ||
988 | } | ||
989 | } | 1135 | } |
1136 | EXPORT_SYMBOL(bch_bset_sort_state_init); | ||
990 | 1137 | ||
991 | static void btree_mergesort(struct btree *b, struct bset *out, | 1138 | static void btree_mergesort(struct btree_keys *b, struct bset *out, |
992 | struct btree_iter *iter, | 1139 | struct btree_iter *iter, |
993 | bool fixup, bool remove_stale) | 1140 | bool fixup, bool remove_stale) |
994 | { | 1141 | { |
1142 | int i; | ||
995 | struct bkey *k, *last = NULL; | 1143 | struct bkey *k, *last = NULL; |
996 | bool (*bad)(struct btree *, const struct bkey *) = remove_stale | 1144 | BKEY_PADDED(k) tmp; |
1145 | bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale | ||
997 | ? bch_ptr_bad | 1146 | ? bch_ptr_bad |
998 | : bch_ptr_invalid; | 1147 | : bch_ptr_invalid; |
999 | 1148 | ||
1149 | /* Heapify the iterator, using our comparison function */ | ||
1150 | for (i = iter->used / 2 - 1; i >= 0; --i) | ||
1151 | heap_sift(iter, i, b->ops->sort_cmp); | ||
1152 | |||
1000 | while (!btree_iter_end(iter)) { | 1153 | while (!btree_iter_end(iter)) { |
1001 | if (fixup && !b->level) | 1154 | if (b->ops->sort_fixup && fixup) |
1002 | btree_sort_fixup(iter); | 1155 | k = b->ops->sort_fixup(iter, &tmp.k); |
1156 | else | ||
1157 | k = NULL; | ||
1158 | |||
1159 | if (!k) | ||
1160 | k = __bch_btree_iter_next(iter, b->ops->sort_cmp); | ||
1003 | 1161 | ||
1004 | k = bch_btree_iter_next(iter); | ||
1005 | if (bad(b, k)) | 1162 | if (bad(b, k)) |
1006 | continue; | 1163 | continue; |
1007 | 1164 | ||
1008 | if (!last) { | 1165 | if (!last) { |
1009 | last = out->start; | 1166 | last = out->start; |
1010 | bkey_copy(last, k); | 1167 | bkey_copy(last, k); |
1011 | } else if (b->level || | 1168 | } else if (!bch_bkey_try_merge(b, last, k)) { |
1012 | !bch_bkey_try_merge(b, last, k)) { | ||
1013 | last = bkey_next(last); | 1169 | last = bkey_next(last); |
1014 | bkey_copy(last, k); | 1170 | bkey_copy(last, k); |
1015 | } | 1171 | } |
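
bch_bset_sort_state_init() above sets aside a one-element page mempool so that a sort can always obtain a temporary buffer; the following hunk shows __btree_sort() falling back to it when __get_free_pages() fails. Below is a small userspace sketch of that try-fast-allocation-then-use-the-reserve pattern, with malloc() and a static buffer standing in for the page allocator and mempool.

/* Sketch: opportunistic allocation with a guaranteed fallback reserve.
 * This illustrates the pattern only; it is not the kernel code. */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

#define RESERVE_BYTES (64 * 1024)

static char reserve[RESERVE_BYTES];
static pthread_mutex_t reserve_lock = PTHREAD_MUTEX_INITIALIZER;

struct sort_buf {
	void *mem;
	bool used_reserve;
};

static struct sort_buf sort_buf_get(size_t bytes)
{
	struct sort_buf buf = { malloc(bytes), false };

	if (!buf.mem) {
		/* fall back to the preallocated reserve; at most one user
		 * at a time, so later callers wait instead of failing
		 * (compare BUG_ON(order > state->page_order) above) */
		assert(bytes <= RESERVE_BYTES);
		pthread_mutex_lock(&reserve_lock);
		buf.mem = reserve;
		buf.used_reserve = true;
	}
	return buf;
}

static void sort_buf_put(struct sort_buf *buf)
{
	if (buf->used_reserve)
		pthread_mutex_unlock(&reserve_lock);
	else
		free(buf->mem);
	buf->mem = NULL;
}

int main(void)
{
	struct sort_buf buf = sort_buf_get(16 * 1024);

	memset(buf.mem, 0, 16 * 1024);	/* use the buffer for a sort */
	sort_buf_put(&buf);
	return 0;
}
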
@@ -1020,27 +1176,30 @@ static void btree_mergesort(struct btree *b, struct bset *out, | |||
1020 | pr_debug("sorted %i keys", out->keys); | 1176 | pr_debug("sorted %i keys", out->keys); |
1021 | } | 1177 | } |
1022 | 1178 | ||
1023 | static void __btree_sort(struct btree *b, struct btree_iter *iter, | 1179 | static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, |
1024 | unsigned start, unsigned order, bool fixup) | 1180 | unsigned start, unsigned order, bool fixup, |
1181 | struct bset_sort_state *state) | ||
1025 | { | 1182 | { |
1026 | uint64_t start_time; | 1183 | uint64_t start_time; |
1027 | bool remove_stale = !b->written; | 1184 | bool used_mempool = false; |
1028 | struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, | 1185 | struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, |
1029 | order); | 1186 | order); |
1030 | if (!out) { | 1187 | if (!out) { |
1031 | mutex_lock(&b->c->sort_lock); | 1188 | struct page *outp; |
1032 | out = b->c->sort; | 1189 | |
1033 | order = ilog2(bucket_pages(b->c)); | 1190 | BUG_ON(order > state->page_order); |
1191 | |||
1192 | outp = mempool_alloc(state->pool, GFP_NOIO); | ||
1193 | out = page_address(outp); | ||
1194 | used_mempool = true; | ||
1195 | order = state->page_order; | ||
1034 | } | 1196 | } |
1035 | 1197 | ||
1036 | start_time = local_clock(); | 1198 | start_time = local_clock(); |
1037 | 1199 | ||
1038 | btree_mergesort(b, out, iter, fixup, remove_stale); | 1200 | btree_mergesort(b, out, iter, fixup, false); |
1039 | b->nsets = start; | 1201 | b->nsets = start; |
1040 | 1202 | ||
1041 | if (!fixup && !start && b->written) | ||
1042 | bch_btree_verify(b, out); | ||
1043 | |||
1044 | if (!start && order == b->page_order) { | 1203 | if (!start && order == b->page_order) { |
1045 | /* | 1204 | /* |
1046 | * Our temporary buffer is the same size as the btree node's | 1205 | * Our temporary buffer is the same size as the btree node's |
@@ -1048,84 +1207,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, | |||
1048 | * memcpy() | 1207 | * memcpy() |
1049 | */ | 1208 | */ |
1050 | 1209 | ||
1051 | out->magic = bset_magic(&b->c->sb); | 1210 | out->magic = b->set->data->magic; |
1052 | out->seq = b->sets[0].data->seq; | 1211 | out->seq = b->set->data->seq; |
1053 | out->version = b->sets[0].data->version; | 1212 | out->version = b->set->data->version; |
1054 | swap(out, b->sets[0].data); | 1213 | swap(out, b->set->data); |
1055 | |||
1056 | if (b->c->sort == b->sets[0].data) | ||
1057 | b->c->sort = out; | ||
1058 | } else { | 1214 | } else { |
1059 | b->sets[start].data->keys = out->keys; | 1215 | b->set[start].data->keys = out->keys; |
1060 | memcpy(b->sets[start].data->start, out->start, | 1216 | memcpy(b->set[start].data->start, out->start, |
1061 | (void *) end(out) - (void *) out->start); | 1217 | (void *) bset_bkey_last(out) - (void *) out->start); |
1062 | } | 1218 | } |
1063 | 1219 | ||
1064 | if (out == b->c->sort) | 1220 | if (used_mempool) |
1065 | mutex_unlock(&b->c->sort_lock); | 1221 | mempool_free(virt_to_page(out), state->pool); |
1066 | else | 1222 | else |
1067 | free_pages((unsigned long) out, order); | 1223 | free_pages((unsigned long) out, order); |
1068 | 1224 | ||
1069 | if (b->written) | 1225 | bch_bset_build_written_tree(b); |
1070 | bset_build_written_tree(b); | ||
1071 | 1226 | ||
1072 | if (!start) | 1227 | if (!start) |
1073 | bch_time_stats_update(&b->c->sort_time, start_time); | 1228 | bch_time_stats_update(&state->time, start_time); |
1074 | } | 1229 | } |
1075 | 1230 | ||
1076 | void bch_btree_sort_partial(struct btree *b, unsigned start) | 1231 | void bch_btree_sort_partial(struct btree_keys *b, unsigned start, |
1232 | struct bset_sort_state *state) | ||
1077 | { | 1233 | { |
1078 | size_t order = b->page_order, keys = 0; | 1234 | size_t order = b->page_order, keys = 0; |
1079 | struct btree_iter iter; | 1235 | struct btree_iter iter; |
1080 | int oldsize = bch_count_data(b); | 1236 | int oldsize = bch_count_data(b); |
1081 | 1237 | ||
1082 | __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); | 1238 | __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); |
1083 | |||
1084 | BUG_ON(b->sets[b->nsets].data == write_block(b) && | ||
1085 | (b->sets[b->nsets].size || b->nsets)); | ||
1086 | |||
1087 | 1239 | ||
1088 | if (start) { | 1240 | if (start) { |
1089 | unsigned i; | 1241 | unsigned i; |
1090 | 1242 | ||
1091 | for (i = start; i <= b->nsets; i++) | 1243 | for (i = start; i <= b->nsets; i++) |
1092 | keys += b->sets[i].data->keys; | 1244 | keys += b->set[i].data->keys; |
1093 | 1245 | ||
1094 | order = roundup_pow_of_two(__set_bytes(b->sets->data, | 1246 | order = get_order(__set_bytes(b->set->data, keys)); |
1095 | keys)) / PAGE_SIZE; | ||
1096 | if (order) | ||
1097 | order = ilog2(order); | ||
1098 | } | 1247 | } |
1099 | 1248 | ||
1100 | __btree_sort(b, &iter, start, order, false); | 1249 | __btree_sort(b, &iter, start, order, false, state); |
1101 | 1250 | ||
1102 | EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); | 1251 | EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); |
1103 | } | 1252 | } |
1253 | EXPORT_SYMBOL(bch_btree_sort_partial); | ||
1104 | 1254 | ||
1105 | void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) | 1255 | void bch_btree_sort_and_fix_extents(struct btree_keys *b, |
1256 | struct btree_iter *iter, | ||
1257 | struct bset_sort_state *state) | ||
1106 | { | 1258 | { |
1107 | BUG_ON(!b->written); | 1259 | __btree_sort(b, iter, 0, b->page_order, true, state); |
1108 | __btree_sort(b, iter, 0, b->page_order, true); | ||
1109 | } | 1260 | } |
1110 | 1261 | ||
1111 | void bch_btree_sort_into(struct btree *b, struct btree *new) | 1262 | void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, |
1263 | struct bset_sort_state *state) | ||
1112 | { | 1264 | { |
1113 | uint64_t start_time = local_clock(); | 1265 | uint64_t start_time = local_clock(); |
1114 | 1266 | ||
1115 | struct btree_iter iter; | 1267 | struct btree_iter iter; |
1116 | bch_btree_iter_init(b, &iter, NULL); | 1268 | bch_btree_iter_init(b, &iter, NULL); |
1117 | 1269 | ||
1118 | btree_mergesort(b, new->sets->data, &iter, false, true); | 1270 | btree_mergesort(b, new->set->data, &iter, false, true); |
1119 | 1271 | ||
1120 | bch_time_stats_update(&b->c->sort_time, start_time); | 1272 | bch_time_stats_update(&state->time, start_time); |
1121 | 1273 | ||
1122 | bkey_copy_key(&new->key, &b->key); | 1274 | new->set->size = 0; // XXX: why? |
1123 | new->sets->size = 0; | ||
1124 | } | 1275 | } |
1125 | 1276 | ||
1126 | #define SORT_CRIT (4096 / sizeof(uint64_t)) | 1277 | #define SORT_CRIT (4096 / sizeof(uint64_t)) |
1127 | 1278 | ||
1128 | void bch_btree_sort_lazy(struct btree *b) | 1279 | void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) |
1129 | { | 1280 | { |
1130 | unsigned crit = SORT_CRIT; | 1281 | unsigned crit = SORT_CRIT; |
1131 | int i; | 1282 | int i; |
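
bch_btree_sort_partial() above now sizes its temporary buffer with get_order(__set_bytes(b->set->data, keys)), which yields the same page order as the old roundup_pow_of_two()/ilog2() pair. Here is a standalone check of that arithmetic; the 32-byte struct bset header size is only an assumption for illustration.

/* Sketch: deriving the sort-buffer page order from the number of key u64s. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned get_order(unsigned long bytes)	/* same contract as the kernel helper */
{
	unsigned order = 0;
	unsigned long pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long header = 32;	/* assumed sizeof(struct bset) */
	unsigned long keys   = 3000;	/* u64s across the sets being resorted */
	unsigned long bytes  = header + keys * sizeof(unsigned long long);

	/* 24032 bytes -> 6 pages -> order 3 (8 pages), matching
	 * get_order(__set_bytes(...)) in bch_btree_sort_partial() */
	printf("bytes=%lu order=%u\n", bytes, get_order(bytes));
	return 0;
}
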
@@ -1134,50 +1285,32 @@ void bch_btree_sort_lazy(struct btree *b) | |||
1134 | if (!b->nsets) | 1285 | if (!b->nsets) |
1135 | goto out; | 1286 | goto out; |
1136 | 1287 | ||
1137 | /* If not a leaf node, always sort */ | ||
1138 | if (b->level) { | ||
1139 | bch_btree_sort(b); | ||
1140 | return; | ||
1141 | } | ||
1142 | |||
1143 | for (i = b->nsets - 1; i >= 0; --i) { | 1288 | for (i = b->nsets - 1; i >= 0; --i) { |
1144 | crit *= b->c->sort_crit_factor; | 1289 | crit *= state->crit_factor; |
1145 | 1290 | ||
1146 | if (b->sets[i].data->keys < crit) { | 1291 | if (b->set[i].data->keys < crit) { |
1147 | bch_btree_sort_partial(b, i); | 1292 | bch_btree_sort_partial(b, i, state); |
1148 | return; | 1293 | return; |
1149 | } | 1294 | } |
1150 | } | 1295 | } |
1151 | 1296 | ||
1152 | /* Sort if we'd overflow */ | 1297 | /* Sort if we'd overflow */ |
1153 | if (b->nsets + 1 == MAX_BSETS) { | 1298 | if (b->nsets + 1 == MAX_BSETS) { |
1154 | bch_btree_sort(b); | 1299 | bch_btree_sort(b, state); |
1155 | return; | 1300 | return; |
1156 | } | 1301 | } |
1157 | 1302 | ||
1158 | out: | 1303 | out: |
1159 | bset_build_written_tree(b); | 1304 | bch_bset_build_written_tree(b); |
1160 | } | 1305 | } |
1306 | EXPORT_SYMBOL(bch_btree_sort_lazy); | ||
1161 | 1307 | ||
1162 | /* Sysfs stuff */ | 1308 | void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) |
1163 | |||
1164 | struct bset_stats { | ||
1165 | struct btree_op op; | ||
1166 | size_t nodes; | ||
1167 | size_t sets_written, sets_unwritten; | ||
1168 | size_t bytes_written, bytes_unwritten; | ||
1169 | size_t floats, failed; | ||
1170 | }; | ||
1171 | |||
1172 | static int btree_bset_stats(struct btree_op *op, struct btree *b) | ||
1173 | { | 1309 | { |
1174 | struct bset_stats *stats = container_of(op, struct bset_stats, op); | ||
1175 | unsigned i; | 1310 | unsigned i; |
1176 | 1311 | ||
1177 | stats->nodes++; | ||
1178 | |||
1179 | for (i = 0; i <= b->nsets; i++) { | 1312 | for (i = 0; i <= b->nsets; i++) { |
1180 | struct bset_tree *t = &b->sets[i]; | 1313 | struct bset_tree *t = &b->set[i]; |
1181 | size_t bytes = t->data->keys * sizeof(uint64_t); | 1314 | size_t bytes = t->data->keys * sizeof(uint64_t); |
1182 | size_t j; | 1315 | size_t j; |
1183 | 1316 | ||
@@ -1195,32 +1328,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b) | |||
1195 | stats->bytes_unwritten += bytes; | 1328 | stats->bytes_unwritten += bytes; |
1196 | } | 1329 | } |
1197 | } | 1330 | } |
1198 | |||
1199 | return MAP_CONTINUE; | ||
1200 | } | ||
1201 | |||
1202 | int bch_bset_print_stats(struct cache_set *c, char *buf) | ||
1203 | { | ||
1204 | struct bset_stats t; | ||
1205 | int ret; | ||
1206 | |||
1207 | memset(&t, 0, sizeof(struct bset_stats)); | ||
1208 | bch_btree_op_init(&t.op, -1); | ||
1209 | |||
1210 | ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); | ||
1211 | if (ret < 0) | ||
1212 | return ret; | ||
1213 | |||
1214 | return snprintf(buf, PAGE_SIZE, | ||
1215 | "btree nodes: %zu\n" | ||
1216 | "written sets: %zu\n" | ||
1217 | "unwritten sets: %zu\n" | ||
1218 | "written key bytes: %zu\n" | ||
1219 | "unwritten key bytes: %zu\n" | ||
1220 | "floats: %zu\n" | ||
1221 | "failed: %zu\n", | ||
1222 | t.nodes, | ||
1223 | t.sets_written, t.sets_unwritten, | ||
1224 | t.bytes_written, t.bytes_unwritten, | ||
1225 | t.floats, t.failed); | ||
1226 | } | 1331 | } |
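
The lazy-sort policy above scans the sets from newest to oldest, scaling the threshold by crit_factor at each step, so small recent sets are folded in cheaply and a full resort only happens when adding another set would exceed MAX_BSETS. Below is a standalone sketch of that decision; the set sizes and factor are illustrative values, not taken from a real node.

/* Sketch of the lazy-sort policy: pick the deepest starting set whose keys
 * still fit under the scaled threshold, else fall back to a full sort. */
#include <stdio.h>

#define MAX_BSETS 4
#define SORT_CRIT (4096 / sizeof(unsigned long long))

int main(void)
{
	unsigned nsets = 2;				/* sets beyond set[0] */
	unsigned keys[MAX_BSETS] = { 9000, 700, 40 };	/* u64s per set */
	unsigned crit_factor = 3;			/* ~int_sqrt(1 << page_order) */
	unsigned long crit = SORT_CRIT;
	int i;

	for (i = nsets - 1; i >= 0; --i) {
		crit *= crit_factor;
		if (keys[i] < crit) {
			/* prints: partial sort from set 1 (crit=1536) */
			printf("partial sort from set %d (crit=%lu)\n", i, crit);
			return 0;
		}
	}

	if (nsets + 1 == MAX_BSETS)
		printf("full sort: would overflow MAX_BSETS\n");
	else
		printf("no sort needed; rebuild lookup tables only\n");
	return 0;
}
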
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 1d3c24f9fa0e..003260f4ddf6 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h | |||
@@ -1,7 +1,11 @@ | |||
1 | #ifndef _BCACHE_BSET_H | 1 | #ifndef _BCACHE_BSET_H |
2 | #define _BCACHE_BSET_H | 2 | #define _BCACHE_BSET_H |
3 | 3 | ||
4 | #include <linux/slab.h> | 4 | #include <linux/bcache.h> |
5 | #include <linux/kernel.h> | ||
6 | #include <linux/types.h> | ||
7 | |||
8 | #include "util.h" /* for time_stats */ | ||
5 | 9 | ||
6 | /* | 10 | /* |
7 | * BKEYS: | 11 | * BKEYS: |
@@ -142,20 +146,13 @@ | |||
142 | * first key in that range of bytes again. | 146 | * first key in that range of bytes again. |
143 | */ | 147 | */ |
144 | 148 | ||
145 | /* Btree key comparison/iteration */ | 149 | struct btree_keys; |
150 | struct btree_iter; | ||
151 | struct btree_iter_set; | ||
152 | struct bkey_float; | ||
146 | 153 | ||
147 | #define MAX_BSETS 4U | 154 | #define MAX_BSETS 4U |
148 | 155 | ||
149 | struct btree_iter { | ||
150 | size_t size, used; | ||
151 | #ifdef CONFIG_BCACHE_DEBUG | ||
152 | struct btree *b; | ||
153 | #endif | ||
154 | struct btree_iter_set { | ||
155 | struct bkey *k, *end; | ||
156 | } data[MAX_BSETS]; | ||
157 | }; | ||
158 | |||
159 | struct bset_tree { | 156 | struct bset_tree { |
160 | /* | 157 | /* |
161 | * We construct a binary tree in an array as if the array | 158 | * We construct a binary tree in an array as if the array |
@@ -165,14 +162,14 @@ struct bset_tree { | |||
165 | */ | 162 | */ |
166 | 163 | ||
167 | /* size of the binary tree and prev array */ | 164 | /* size of the binary tree and prev array */ |
168 | unsigned size; | 165 | unsigned size; |
169 | 166 | ||
170 | /* function of size - precalculated for to_inorder() */ | 167 | /* function of size - precalculated for to_inorder() */ |
171 | unsigned extra; | 168 | unsigned extra; |
172 | 169 | ||
173 | /* copy of the last key in the set */ | 170 | /* copy of the last key in the set */ |
174 | struct bkey end; | 171 | struct bkey end; |
175 | struct bkey_float *tree; | 172 | struct bkey_float *tree; |
176 | 173 | ||
177 | /* | 174 | /* |
178 | * The nodes in the bset tree point to specific keys - this | 175 | * The nodes in the bset tree point to specific keys - this |
@@ -182,12 +179,219 @@ struct bset_tree { | |||
182 | * to keep bkey_float to 4 bytes and prev isn't used in the fast | 179 | * to keep bkey_float to 4 bytes and prev isn't used in the fast |
183 | * path. | 180 | * path. |
184 | */ | 181 | */ |
185 | uint8_t *prev; | 182 | uint8_t *prev; |
186 | 183 | ||
187 | /* The actual btree node, with pointers to each sorted set */ | 184 | /* The actual btree node, with pointers to each sorted set */ |
188 | struct bset *data; | 185 | struct bset *data; |
186 | }; | ||
187 | |||
188 | struct btree_keys_ops { | ||
189 | bool (*sort_cmp)(struct btree_iter_set, | ||
190 | struct btree_iter_set); | ||
191 | struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); | ||
192 | bool (*insert_fixup)(struct btree_keys *, struct bkey *, | ||
193 | struct btree_iter *, struct bkey *); | ||
194 | bool (*key_invalid)(struct btree_keys *, | ||
195 | const struct bkey *); | ||
196 | bool (*key_bad)(struct btree_keys *, const struct bkey *); | ||
197 | bool (*key_merge)(struct btree_keys *, | ||
198 | struct bkey *, struct bkey *); | ||
199 | void (*key_to_text)(char *, size_t, const struct bkey *); | ||
200 | void (*key_dump)(struct btree_keys *, const struct bkey *); | ||
201 | |||
202 | /* | ||
203 | * Only used for deciding whether to use START_KEY(k) or just the key | ||
204 | * itself in a couple places | ||
205 | */ | ||
206 | bool is_extents; | ||
207 | }; | ||
208 | |||
209 | struct btree_keys { | ||
210 | const struct btree_keys_ops *ops; | ||
211 | uint8_t page_order; | ||
212 | uint8_t nsets; | ||
213 | unsigned last_set_unwritten:1; | ||
214 | bool *expensive_debug_checks; | ||
215 | |||
216 | /* | ||
217 | * Sets of sorted keys - the real btree node - plus a binary search tree | ||
218 | * | ||
219 | * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||
220 | * to the memory we have allocated for this btree node. Additionally, | ||
221 | * set[0]->data points to the entire btree node as it exists on disk. | ||
222 | */ | ||
223 | struct bset_tree set[MAX_BSETS]; | ||
224 | }; | ||
225 | |||
226 | static inline struct bset_tree *bset_tree_last(struct btree_keys *b) | ||
227 | { | ||
228 | return b->set + b->nsets; | ||
229 | } | ||
230 | |||
231 | static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) | ||
232 | { | ||
233 | return t <= b->set + b->nsets - b->last_set_unwritten; | ||
234 | } | ||
235 | |||
236 | static inline bool bkey_written(struct btree_keys *b, struct bkey *k) | ||
237 | { | ||
238 | return !b->last_set_unwritten || k < b->set[b->nsets].data->start; | ||
239 | } | ||
240 | |||
241 | static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) | ||
242 | { | ||
243 | return ((size_t) i) - ((size_t) b->set->data); | ||
244 | } | ||
245 | |||
246 | static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) | ||
247 | { | ||
248 | return bset_byte_offset(b, i) >> 9; | ||
249 | } | ||
250 | |||
251 | #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) | ||
252 | #define set_bytes(i) __set_bytes(i, i->keys) | ||
253 | |||
254 | #define __set_blocks(i, k, block_bytes) \ | ||
255 | DIV_ROUND_UP(__set_bytes(i, k), block_bytes) | ||
256 | #define set_blocks(i, block_bytes) \ | ||
257 | __set_blocks(i, (i)->keys, block_bytes) | ||
258 | |||
259 | static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) | ||
260 | { | ||
261 | struct bset_tree *t = bset_tree_last(b); | ||
262 | |||
263 | BUG_ON((PAGE_SIZE << b->page_order) < | ||
264 | (bset_byte_offset(b, t->data) + set_bytes(t->data))); | ||
265 | |||
266 | if (!b->last_set_unwritten) | ||
267 | return 0; | ||
268 | |||
269 | return ((PAGE_SIZE << b->page_order) - | ||
270 | (bset_byte_offset(b, t->data) + set_bytes(t->data))) / | ||
271 | sizeof(u64); | ||
272 | } | ||
273 | |||
274 | static inline struct bset *bset_next_set(struct btree_keys *b, | ||
275 | unsigned block_bytes) | ||
276 | { | ||
277 | struct bset *i = bset_tree_last(b)->data; | ||
278 | |||
279 | return ((void *) i) + roundup(set_bytes(i), block_bytes); | ||
280 | } | ||
281 | |||
282 | void bch_btree_keys_free(struct btree_keys *); | ||
283 | int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); | ||
284 | void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, | ||
285 | bool *); | ||
286 | |||
287 | void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); | ||
288 | void bch_bset_build_written_tree(struct btree_keys *); | ||
289 | void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); | ||
290 | bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); | ||
291 | void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); | ||
292 | unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, | ||
293 | struct bkey *); | ||
294 | |||
295 | enum { | ||
296 | BTREE_INSERT_STATUS_NO_INSERT = 0, | ||
297 | BTREE_INSERT_STATUS_INSERT, | ||
298 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
299 | BTREE_INSERT_STATUS_OVERWROTE, | ||
300 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
189 | }; | 301 | }; |
190 | 302 | ||
303 | /* Btree key iteration */ | ||
304 | |||
305 | struct btree_iter { | ||
306 | size_t size, used; | ||
307 | #ifdef CONFIG_BCACHE_DEBUG | ||
308 | struct btree_keys *b; | ||
309 | #endif | ||
310 | struct btree_iter_set { | ||
311 | struct bkey *k, *end; | ||
312 | } data[MAX_BSETS]; | ||
313 | }; | ||
314 | |||
315 | typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); | ||
316 | |||
317 | struct bkey *bch_btree_iter_next(struct btree_iter *); | ||
318 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||
319 | struct btree_keys *, ptr_filter_fn); | ||
320 | |||
321 | void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||
322 | struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, | ||
323 | struct bkey *); | ||
324 | |||
325 | struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, | ||
326 | const struct bkey *); | ||
327 | |||
328 | /* | ||
329 | * Returns the first key that is strictly greater than search | ||
330 | */ | ||
331 | static inline struct bkey *bch_bset_search(struct btree_keys *b, | ||
332 | struct bset_tree *t, | ||
333 | const struct bkey *search) | ||
334 | { | ||
335 | return search ? __bch_bset_search(b, t, search) : t->data->start; | ||
336 | } | ||
337 | |||
338 | #define for_each_key_filter(b, k, iter, filter) \ | ||
339 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
340 | ((k) = bch_btree_iter_next_filter((iter), (b), filter));) | ||
341 | |||
342 | #define for_each_key(b, k, iter) \ | ||
343 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
344 | ((k) = bch_btree_iter_next(iter));) | ||
345 | |||
346 | /* Sorting */ | ||
347 | |||
348 | struct bset_sort_state { | ||
349 | mempool_t *pool; | ||
350 | |||
351 | unsigned page_order; | ||
352 | unsigned crit_factor; | ||
353 | |||
354 | struct time_stats time; | ||
355 | }; | ||
356 | |||
357 | void bch_bset_sort_state_free(struct bset_sort_state *); | ||
358 | int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); | ||
359 | void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); | ||
360 | void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, | ||
361 | struct bset_sort_state *); | ||
362 | void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, | ||
363 | struct bset_sort_state *); | ||
364 | void bch_btree_sort_partial(struct btree_keys *, unsigned, | ||
365 | struct bset_sort_state *); | ||
366 | |||
367 | static inline void bch_btree_sort(struct btree_keys *b, | ||
368 | struct bset_sort_state *state) | ||
369 | { | ||
370 | bch_btree_sort_partial(b, 0, state); | ||
371 | } | ||
372 | |||
373 | struct bset_stats { | ||
374 | size_t sets_written, sets_unwritten; | ||
375 | size_t bytes_written, bytes_unwritten; | ||
376 | size_t floats, failed; | ||
377 | }; | ||
378 | |||
379 | void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); | ||
380 | |||
381 | /* Bkey utility code */ | ||
382 | |||
383 | #define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) | ||
384 | |||
385 | static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) | ||
386 | { | ||
387 | return bkey_idx(i->start, idx); | ||
388 | } | ||
389 | |||
390 | static inline void bkey_init(struct bkey *k) | ||
391 | { | ||
392 | *k = ZERO_KEY; | ||
393 | } | ||
394 | |||
191 | static __always_inline int64_t bkey_cmp(const struct bkey *l, | 395 | static __always_inline int64_t bkey_cmp(const struct bkey *l, |
192 | const struct bkey *r) | 396 | const struct bkey *r) |
193 | { | 397 | { |
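
The new btree_keys_ops structure above moves all format-specific behaviour (sort comparison, fixup, validity checks, merging, printing) behind a vtable that the generic bset code reaches through thin inline wrappers such as bch_ptr_invalid(). The sketch below shows that dispatch pattern in miniature; the types and the extent-merge rule are simplified stand-ins, not the kernel definitions.

/* Sketch: an ops vtable plus inline wrappers, mirroring how btree_keys_ops
 * lets extent and non-extent btrees share the generic bset code. */
#include <stdbool.h>
#include <stdio.h>

struct key { unsigned long long offset, size; };	/* offset = end of extent */

struct keys;

struct keys_ops {
	bool (*key_invalid)(const struct keys *, const struct key *);
	bool (*key_merge)(struct keys *, struct key *, struct key *);
	bool is_extents;
};

struct keys {
	const struct keys_ops *ops;
};

/* thin wrappers, analogous to bch_ptr_invalid()/bch_bkey_try_merge() */
static inline bool key_invalid(struct keys *b, const struct key *k)
{
	return b->ops->key_invalid(b, k);
}

static inline bool try_merge(struct keys *b, struct key *l, struct key *r)
{
	return b->ops->key_merge && b->ops->key_merge(b, l, r);
}

/* one concrete implementation: extent keys that merge when back to back */
static bool extent_invalid(const struct keys *b, const struct key *k)
{
	(void) b;
	return !k->size;
}

static bool extent_merge(struct keys *b, struct key *l, struct key *r)
{
	(void) b;
	if (l->offset != r->offset - r->size)
		return false;
	l->offset = r->offset;
	l->size += r->size;
	return true;
}

static const struct keys_ops extent_ops = {
	.key_invalid	= extent_invalid,
	.key_merge	= extent_merge,
	.is_extents	= true,
};

int main(void)
{
	struct keys b = { .ops = &extent_ops };
	struct key l = { .offset = 8, .size = 8 }, r = { .offset = 16, .size = 8 };

	printf("l invalid: %d\n", key_invalid(&b, &l));			/* 0 */
	if (try_merge(&b, &l, &r))
		printf("merged: end=%llu size=%llu\n", l.offset, l.size);	/* 16 16 */
	return 0;
}
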
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, | |||
196 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); | 400 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); |
197 | } | 401 | } |
198 | 402 | ||
403 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||
404 | unsigned); | ||
405 | bool __bch_cut_front(const struct bkey *, struct bkey *); | ||
406 | bool __bch_cut_back(const struct bkey *, struct bkey *); | ||
407 | |||
408 | static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | ||
409 | { | ||
410 | BUG_ON(bkey_cmp(where, k) > 0); | ||
411 | return __bch_cut_front(where, k); | ||
412 | } | ||
413 | |||
414 | static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | ||
415 | { | ||
416 | BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||
417 | return __bch_cut_back(where, k); | ||
418 | } | ||
419 | |||
420 | #define PRECEDING_KEY(_k) \ | ||
421 | ({ \ | ||
422 | struct bkey *_ret = NULL; \ | ||
423 | \ | ||
424 | if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ | ||
425 | _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ | ||
426 | \ | ||
427 | if (!_ret->low) \ | ||
428 | _ret->high--; \ | ||
429 | _ret->low--; \ | ||
430 | } \ | ||
431 | \ | ||
432 | _ret; \ | ||
433 | }) | ||
434 | |||
435 | static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) | ||
436 | { | ||
437 | return b->ops->key_invalid(b, k); | ||
438 | } | ||
439 | |||
440 | static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) | ||
441 | { | ||
442 | return b->ops->key_bad(b, k); | ||
443 | } | ||
444 | |||
445 | static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, | ||
446 | size_t size, const struct bkey *k) | ||
447 | { | ||
448 | return b->ops->key_to_text(buf, size, k); | ||
449 | } | ||
450 | |||
451 | static inline bool bch_bkey_equal_header(const struct bkey *l, | ||
452 | const struct bkey *r) | ||
453 | { | ||
454 | return (KEY_DIRTY(l) == KEY_DIRTY(r) && | ||
455 | KEY_PTRS(l) == KEY_PTRS(r) && | ||
456 | KEY_CSUM(l) == KEY_CSUM(r)); | ||
457 | } | ||
458 | |||
199 | /* Keylists */ | 459 | /* Keylists */ |
200 | 460 | ||
201 | struct keylist { | 461 | struct keylist { |
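
PRECEDING_KEY() above builds the key just before _k by treating the packed (inode, offset) pair as one 128-bit value: when the low word is zero it borrows from the high word before decrementing. Here is a standalone sketch of that borrow, with plain uint64_t halves standing in for the bkey fields.

/* Sketch of the PRECEDING_KEY() borrow across a 128-bit (high, low) pair. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

struct key128 { uint64_t high, low; };

/* Decrement the 128-bit value by one; the caller guarantees it is non-zero,
 * just as PRECEDING_KEY() only runs when KEY_INODE or KEY_OFFSET is set. */
static struct key128 preceding(struct key128 k)
{
	if (!k.low)
		k.high--;	/* borrow from the high word */
	k.low--;		/* 0 wraps to UINT64_MAX, as intended */
	return k;
}

int main(void)
{
	struct key128 k = { .high = 5, .low = 0 };
	struct key128 p = preceding(k);

	/* prints: high=4 low=18446744073709551615 */
	printf("high=%" PRIu64 " low=%" PRIu64 "\n", p.high, p.low);
	return 0;
}
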
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l) | |||
257 | 517 | ||
258 | struct bkey *bch_keylist_pop(struct keylist *); | 518 | struct bkey *bch_keylist_pop(struct keylist *); |
259 | void bch_keylist_pop_front(struct keylist *); | 519 | void bch_keylist_pop_front(struct keylist *); |
260 | int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | 520 | int __bch_keylist_realloc(struct keylist *, unsigned); |
261 | |||
262 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||
263 | unsigned); | ||
264 | bool __bch_cut_front(const struct bkey *, struct bkey *); | ||
265 | bool __bch_cut_back(const struct bkey *, struct bkey *); | ||
266 | 521 | ||
267 | static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | 522 | /* Debug stuff */ |
268 | { | ||
269 | BUG_ON(bkey_cmp(where, k) > 0); | ||
270 | return __bch_cut_front(where, k); | ||
271 | } | ||
272 | 523 | ||
273 | static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | 524 | #ifdef CONFIG_BCACHE_DEBUG |
274 | { | ||
275 | BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||
276 | return __bch_cut_back(where, k); | ||
277 | } | ||
278 | |||
279 | const char *bch_ptr_status(struct cache_set *, const struct bkey *); | ||
280 | bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); | ||
281 | bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); | ||
282 | |||
283 | bool bch_ptr_bad(struct btree *, const struct bkey *); | ||
284 | |||
285 | static inline uint8_t gen_after(uint8_t a, uint8_t b) | ||
286 | { | ||
287 | uint8_t r = a - b; | ||
288 | return r > 128U ? 0 : r; | ||
289 | } | ||
290 | |||
291 | static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, | ||
292 | unsigned i) | ||
293 | { | ||
294 | return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); | ||
295 | } | ||
296 | |||
297 | static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||
298 | unsigned i) | ||
299 | { | ||
300 | return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||
301 | } | ||
302 | |||
303 | |||
304 | typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | ||
305 | |||
306 | struct bkey *bch_btree_iter_next(struct btree_iter *); | ||
307 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||
308 | struct btree *, ptr_filter_fn); | ||
309 | |||
310 | void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||
311 | struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, | ||
312 | struct bkey *, struct bset_tree *); | ||
313 | |||
314 | /* 32 bits total: */ | ||
315 | #define BKEY_MID_BITS 3 | ||
316 | #define BKEY_EXPONENT_BITS 7 | ||
317 | #define BKEY_MANTISSA_BITS 22 | ||
318 | #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) | ||
319 | |||
320 | struct bkey_float { | ||
321 | unsigned exponent:BKEY_EXPONENT_BITS; | ||
322 | unsigned m:BKEY_MID_BITS; | ||
323 | unsigned mantissa:BKEY_MANTISSA_BITS; | ||
324 | } __packed; | ||
325 | |||
326 | /* | ||
327 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||
328 | * it used to be 64, but I realized the lookup code would touch slightly less | ||
329 | * memory if it was 128. | ||
330 | * | ||
331 | * It defines the number of bytes (in struct bset) per struct bkey_float in | ||
332 | * the auxiliary search tree - when we're done searching the bset_float tree we | ||
333 | * have this many bytes left that we do a linear search over. | ||
334 | * | ||
335 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||
336 | * we're touching one fewer cacheline in the bset tree in exchange for one more | ||
337 | * cacheline in the linear search - but the linear search might stop before it | ||
338 | * gets to the second cacheline. | ||
339 | */ | ||
340 | |||
341 | #define BSET_CACHELINE 128 | ||
342 | #define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) | ||
343 | 525 | ||
344 | #define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) | 526 | int __bch_count_data(struct btree_keys *); |
345 | #define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) | 527 | void __bch_check_keys(struct btree_keys *, const char *, ...); |
528 | void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); | ||
529 | void bch_dump_bucket(struct btree_keys *); | ||
346 | 530 | ||
347 | void bch_bset_init_next(struct btree *); | 531 | #else |
348 | 532 | ||
349 | void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); | 533 | static inline int __bch_count_data(struct btree_keys *b) { return -1; } |
350 | void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | 534 | static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} |
535 | static inline void bch_dump_bucket(struct btree_keys *b) {} | ||
536 | void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); | ||
351 | 537 | ||
352 | struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | 538 | #endif |
353 | const struct bkey *); | ||
354 | 539 | ||
355 | /* | 540 | static inline bool btree_keys_expensive_checks(struct btree_keys *b) |
356 | * Returns the first key that is strictly greater than search | ||
357 | */ | ||
358 | static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | ||
359 | const struct bkey *search) | ||
360 | { | 541 | { |
361 | return search ? __bch_bset_search(b, t, search) : t->data->start; | 542 | #ifdef CONFIG_BCACHE_DEBUG |
543 | return *b->expensive_debug_checks; | ||
544 | #else | ||
545 | return false; | ||
546 | #endif | ||
362 | } | 547 | } |
363 | 548 | ||
364 | #define PRECEDING_KEY(_k) \ | 549 | static inline int bch_count_data(struct btree_keys *b) |
365 | ({ \ | ||
366 | struct bkey *_ret = NULL; \ | ||
367 | \ | ||
368 | if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ | ||
369 | _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ | ||
370 | \ | ||
371 | if (!_ret->low) \ | ||
372 | _ret->high--; \ | ||
373 | _ret->low--; \ | ||
374 | } \ | ||
375 | \ | ||
376 | _ret; \ | ||
377 | }) | ||
378 | |||
379 | bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | ||
380 | void bch_btree_sort_lazy(struct btree *); | ||
381 | void bch_btree_sort_into(struct btree *, struct btree *); | ||
382 | void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); | ||
383 | void bch_btree_sort_partial(struct btree *, unsigned); | ||
384 | |||
385 | static inline void bch_btree_sort(struct btree *b) | ||
386 | { | 550 | { |
387 | bch_btree_sort_partial(b, 0); | 551 | return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; |
388 | } | 552 | } |
389 | 553 | ||
390 | int bch_bset_print_stats(struct cache_set *, char *); | 554 | #define bch_check_keys(b, ...) \ |
555 | do { \ | ||
556 | if (btree_keys_expensive_checks(b)) \ | ||
557 | __bch_check_keys(b, __VA_ARGS__); \ | ||
558 | } while (0) | ||
391 | 559 | ||
392 | #endif | 560 | #endif |
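
The reshuffled debug block above gates the expensive key checks twice: CONFIG_BCACHE_DEBUG decides at build time whether the code exists at all, and a runtime flag (reached through b->expensive_debug_checks) decides whether it actually runs, so bch_check_keys() costs nothing in production builds. A small sketch of that two-level gate follows; the flag name and the check body are illustrative only.

/* Sketch: compile-time plus run-time gating of expensive checks, in the
 * style of btree_keys_expensive_checks()/bch_check_keys(). */
#include <stdbool.h>
#include <stdio.h>

#ifdef DEBUG_CHECKS
static bool expensive_checks = true;	/* would be *b->expensive_debug_checks */

static void __check_keys(const char *why)
{
	printf("running expensive key checks: %s\n", why);
}

#define check_keys(why)					\
do {							\
	if (expensive_checks)				\
		__check_keys(why);			\
} while (0)

#else
/* without the debug config the call sites compile to nothing */
#define check_keys(why)		do { } while (0)
#endif

int main(void)
{
	check_keys("writing");	/* prints only when built with -DDEBUG_CHECKS */
	return 0;
}
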
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 31bb53fcc67a..5f9c2a665ca5 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #include "bcache.h" | 23 | #include "bcache.h" |
24 | #include "btree.h" | 24 | #include "btree.h" |
25 | #include "debug.h" | 25 | #include "debug.h" |
26 | #include "writeback.h" | 26 | #include "extents.h" |
27 | 27 | ||
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/bitops.h> | 29 | #include <linux/bitops.h> |
@@ -89,13 +89,6 @@ | |||
89 | * Test module load/unload | 89 | * Test module load/unload |
90 | */ | 90 | */ |
91 | 91 | ||
92 | enum { | ||
93 | BTREE_INSERT_STATUS_INSERT, | ||
94 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
95 | BTREE_INSERT_STATUS_OVERWROTE, | ||
96 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
97 | }; | ||
98 | |||
99 | #define MAX_NEED_GC 64 | 92 | #define MAX_NEED_GC 64 |
100 | #define MAX_SAVE_PRIO 72 | 93 | #define MAX_SAVE_PRIO 72 |
101 | 94 | ||
@@ -106,14 +99,6 @@ enum { | |||
106 | 99 | ||
107 | static struct workqueue_struct *btree_io_wq; | 100 | static struct workqueue_struct *btree_io_wq; |
108 | 101 | ||
109 | static inline bool should_split(struct btree *b) | ||
110 | { | ||
111 | struct bset *i = write_block(b); | ||
112 | return b->written >= btree_blocks(b) || | ||
113 | (b->written + __set_blocks(i, i->keys + 15, b->c) | ||
114 | > btree_blocks(b)); | ||
115 | } | ||
116 | |||
117 | #define insert_lock(s, b) ((b)->level <= (s)->lock) | 102 | #define insert_lock(s, b) ((b)->level <= (s)->lock) |
118 | 103 | ||
119 | /* | 104 | /* |
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b) | |||
167 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | 152 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ |
168 | } \ | 153 | } \ |
169 | rw_unlock(_w, _b); \ | 154 | rw_unlock(_w, _b); \ |
155 | if (_r == -EINTR) \ | ||
156 | schedule(); \ | ||
170 | bch_cannibalize_unlock(c); \ | 157 | bch_cannibalize_unlock(c); \ |
171 | if (_r == -ENOSPC) { \ | 158 | if (_r == -ENOSPC) { \ |
172 | wait_event((c)->try_wait, \ | 159 | wait_event((c)->try_wait, \ |
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b) | |||
175 | } \ | 162 | } \ |
176 | } while (_r == -EINTR); \ | 163 | } while (_r == -EINTR); \ |
177 | \ | 164 | \ |
165 | finish_wait(&(c)->bucket_wait, &(op)->wait); \ | ||
178 | _r; \ | 166 | _r; \ |
179 | }) | 167 | }) |
180 | 168 | ||
169 | static inline struct bset *write_block(struct btree *b) | ||
170 | { | ||
171 | return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); | ||
172 | } | ||
173 | |||
181 | /* Btree key manipulation */ | 174 | /* Btree key manipulation */ |
182 | 175 | ||
183 | void bkey_put(struct cache_set *c, struct bkey *k) | 176 | void bkey_put(struct cache_set *c, struct bkey *k) |
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k) | |||
194 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | 187 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) |
195 | { | 188 | { |
196 | uint64_t crc = b->key.ptr[0]; | 189 | uint64_t crc = b->key.ptr[0]; |
197 | void *data = (void *) i + 8, *end = end(i); | 190 | void *data = (void *) i + 8, *end = bset_bkey_last(i); |
198 | 191 | ||
199 | crc = bch_crc64_update(crc, data, end - data); | 192 | crc = bch_crc64_update(crc, data, end - data); |
200 | return crc ^ 0xffffffffffffffffULL; | 193 | return crc ^ 0xffffffffffffffffULL; |
201 | } | 194 | } |
202 | 195 | ||
203 | static void bch_btree_node_read_done(struct btree *b) | 196 | void bch_btree_node_read_done(struct btree *b) |
204 | { | 197 | { |
205 | const char *err = "bad btree header"; | 198 | const char *err = "bad btree header"; |
206 | struct bset *i = b->sets[0].data; | 199 | struct bset *i = btree_bset_first(b); |
207 | struct btree_iter *iter; | 200 | struct btree_iter *iter; |
208 | 201 | ||
209 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); | 202 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); |
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b) | |||
211 | iter->used = 0; | 204 | iter->used = 0; |
212 | 205 | ||
213 | #ifdef CONFIG_BCACHE_DEBUG | 206 | #ifdef CONFIG_BCACHE_DEBUG |
214 | iter->b = b; | 207 | iter->b = &b->keys; |
215 | #endif | 208 | #endif |
216 | 209 | ||
217 | if (!i->seq) | 210 | if (!i->seq) |
218 | goto err; | 211 | goto err; |
219 | 212 | ||
220 | for (; | 213 | for (; |
221 | b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; | 214 | b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; |
222 | i = write_block(b)) { | 215 | i = write_block(b)) { |
223 | err = "unsupported bset version"; | 216 | err = "unsupported bset version"; |
224 | if (i->version > BCACHE_BSET_VERSION) | 217 | if (i->version > BCACHE_BSET_VERSION) |
225 | goto err; | 218 | goto err; |
226 | 219 | ||
227 | err = "bad btree header"; | 220 | err = "bad btree header"; |
228 | if (b->written + set_blocks(i, b->c) > btree_blocks(b)) | 221 | if (b->written + set_blocks(i, block_bytes(b->c)) > |
222 | btree_blocks(b)) | ||
229 | goto err; | 223 | goto err; |
230 | 224 | ||
231 | err = "bad magic"; | 225 | err = "bad magic"; |
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b) | |||
245 | } | 239 | } |
246 | 240 | ||
247 | err = "empty set"; | 241 | err = "empty set"; |
248 | if (i != b->sets[0].data && !i->keys) | 242 | if (i != b->keys.set[0].data && !i->keys) |
249 | goto err; | 243 | goto err; |
250 | 244 | ||
251 | bch_btree_iter_push(iter, i->start, end(i)); | 245 | bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); |
252 | 246 | ||
253 | b->written += set_blocks(i, b->c); | 247 | b->written += set_blocks(i, block_bytes(b->c)); |
254 | } | 248 | } |
255 | 249 | ||
256 | err = "corrupted btree"; | 250 | err = "corrupted btree"; |
257 | for (i = write_block(b); | 251 | for (i = write_block(b); |
258 | index(i, b) < btree_blocks(b); | 252 | bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); |
259 | i = ((void *) i) + block_bytes(b->c)) | 253 | i = ((void *) i) + block_bytes(b->c)) |
260 | if (i->seq == b->sets[0].data->seq) | 254 | if (i->seq == b->keys.set[0].data->seq) |
261 | goto err; | 255 | goto err; |
262 | 256 | ||
263 | bch_btree_sort_and_fix_extents(b, iter); | 257 | bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); |
264 | 258 | ||
265 | i = b->sets[0].data; | 259 | i = b->keys.set[0].data; |
266 | err = "short btree key"; | 260 | err = "short btree key"; |
267 | if (b->sets[0].size && | 261 | if (b->keys.set[0].size && |
268 | bkey_cmp(&b->key, &b->sets[0].end) < 0) | 262 | bkey_cmp(&b->key, &b->keys.set[0].end) < 0) |
269 | goto err; | 263 | goto err; |
270 | 264 | ||
271 | if (b->written < btree_blocks(b)) | 265 | if (b->written < btree_blocks(b)) |
272 | bch_bset_init_next(b); | 266 | bch_bset_init_next(&b->keys, write_block(b), |
267 | bset_magic(&b->c->sb)); | ||
273 | out: | 268 | out: |
274 | mempool_free(iter, b->c->fill_iter); | 269 | mempool_free(iter, b->c->fill_iter); |
275 | return; | 270 | return; |
276 | err: | 271 | err: |
277 | set_btree_node_io_error(b); | 272 | set_btree_node_io_error(b); |
278 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", | 273 | bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", |
279 | err, PTR_BUCKET_NR(b->c, &b->key, 0), | 274 | err, PTR_BUCKET_NR(b->c, &b->key, 0), |
280 | index(i, b), i->keys); | 275 | bset_block_offset(b, i), i->keys); |
281 | goto out; | 276 | goto out; |
282 | } | 277 | } |
283 | 278 | ||
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error) | |||
287 | closure_put(cl); | 282 | closure_put(cl); |
288 | } | 283 | } |
289 | 284 | ||
290 | void bch_btree_node_read(struct btree *b) | 285 | static void bch_btree_node_read(struct btree *b) |
291 | { | 286 | { |
292 | uint64_t start_time = local_clock(); | 287 | uint64_t start_time = local_clock(); |
293 | struct closure cl; | 288 | struct closure cl; |
@@ -299,11 +294,11 @@ void bch_btree_node_read(struct btree *b) | |||
299 | 294 | ||
300 | bio = bch_bbio_alloc(b->c); | 295 | bio = bch_bbio_alloc(b->c); |
301 | bio->bi_rw = REQ_META|READ_SYNC; | 296 | bio->bi_rw = REQ_META|READ_SYNC; |
302 | bio->bi_size = KEY_SIZE(&b->key) << 9; | 297 | bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; |
303 | bio->bi_end_io = btree_node_read_endio; | 298 | bio->bi_end_io = btree_node_read_endio; |
304 | bio->bi_private = &cl; | 299 | bio->bi_private = &cl; |
305 | 300 | ||
306 | bch_bio_map(bio, b->sets[0].data); | 301 | bch_bio_map(bio, b->keys.set[0].data); |
307 | 302 | ||
308 | bch_submit_bbio(bio, b->c, &b->key, 0); | 303 | bch_submit_bbio(bio, b->c, &b->key, 0); |
309 | closure_sync(&cl); | 304 | closure_sync(&cl); |
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) | |||
340 | w->journal = NULL; | 335 | w->journal = NULL; |
341 | } | 336 | } |
342 | 337 | ||
338 | static void btree_node_write_unlock(struct closure *cl) | ||
339 | { | ||
340 | struct btree *b = container_of(cl, struct btree, io); | ||
341 | |||
342 | up(&b->io_mutex); | ||
343 | } | ||
344 | |||
343 | static void __btree_node_write_done(struct closure *cl) | 345 | static void __btree_node_write_done(struct closure *cl) |
344 | { | 346 | { |
345 | struct btree *b = container_of(cl, struct btree, io.cl); | 347 | struct btree *b = container_of(cl, struct btree, io); |
346 | struct btree_write *w = btree_prev_write(b); | 348 | struct btree_write *w = btree_prev_write(b); |
347 | 349 | ||
348 | bch_bbio_free(b->bio, b->c); | 350 | bch_bbio_free(b->bio, b->c); |
@@ -353,16 +355,16 @@ static void __btree_node_write_done(struct closure *cl) | |||
353 | queue_delayed_work(btree_io_wq, &b->work, | 355 | queue_delayed_work(btree_io_wq, &b->work, |
354 | msecs_to_jiffies(30000)); | 356 | msecs_to_jiffies(30000)); |
355 | 357 | ||
356 | closure_return(cl); | 358 | closure_return_with_destructor(cl, btree_node_write_unlock); |
357 | } | 359 | } |
358 | 360 | ||
359 | static void btree_node_write_done(struct closure *cl) | 361 | static void btree_node_write_done(struct closure *cl) |
360 | { | 362 | { |
361 | struct btree *b = container_of(cl, struct btree, io.cl); | 363 | struct btree *b = container_of(cl, struct btree, io); |
362 | struct bio_vec *bv; | 364 | struct bio_vec *bv; |
363 | int n; | 365 | int n; |
364 | 366 | ||
365 | __bio_for_each_segment(bv, b->bio, n, 0) | 367 | bio_for_each_segment_all(bv, b->bio, n) |
366 | __free_page(bv->bv_page); | 368 | __free_page(bv->bv_page); |
367 | 369 | ||
368 | __btree_node_write_done(cl); | 370 | __btree_node_write_done(cl); |
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl) | |||
371 | static void btree_node_write_endio(struct bio *bio, int error) | 373 | static void btree_node_write_endio(struct bio *bio, int error) |
372 | { | 374 | { |
373 | struct closure *cl = bio->bi_private; | 375 | struct closure *cl = bio->bi_private; |
374 | struct btree *b = container_of(cl, struct btree, io.cl); | 376 | struct btree *b = container_of(cl, struct btree, io); |
375 | 377 | ||
376 | if (error) | 378 | if (error) |
377 | set_btree_node_io_error(b); | 379 | set_btree_node_io_error(b); |
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error) | |||
382 | 384 | ||
383 | static void do_btree_node_write(struct btree *b) | 385 | static void do_btree_node_write(struct btree *b) |
384 | { | 386 | { |
385 | struct closure *cl = &b->io.cl; | 387 | struct closure *cl = &b->io; |
386 | struct bset *i = b->sets[b->nsets].data; | 388 | struct bset *i = btree_bset_last(b); |
387 | BKEY_PADDED(key) k; | 389 | BKEY_PADDED(key) k; |
388 | 390 | ||
389 | i->version = BCACHE_BSET_VERSION; | 391 | i->version = BCACHE_BSET_VERSION; |
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b) | |||
395 | b->bio->bi_end_io = btree_node_write_endio; | 397 | b->bio->bi_end_io = btree_node_write_endio; |
396 | b->bio->bi_private = cl; | 398 | b->bio->bi_private = cl; |
397 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; | 399 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; |
398 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | 400 | b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); |
399 | bch_bio_map(b->bio, i); | 401 | bch_bio_map(b->bio, i); |
400 | 402 | ||
401 | /* | 403 | /* |
@@ -414,14 +416,15 @@ static void do_btree_node_write(struct btree *b) | |||
414 | */ | 416 | */ |
415 | 417 | ||
416 | bkey_copy(&k.key, &b->key); | 418 | bkey_copy(&k.key, &b->key); |
417 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | 419 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + |
420 | bset_sector_offset(&b->keys, i)); | ||
418 | 421 | ||
419 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { | 422 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { |
420 | int j; | 423 | int j; |
421 | struct bio_vec *bv; | 424 | struct bio_vec *bv; |
422 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 425 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); |
423 | 426 | ||
424 | bio_for_each_segment(bv, b->bio, j) | 427 | bio_for_each_segment_all(bv, b->bio, j) |
425 | memcpy(page_address(bv->bv_page), | 428 | memcpy(page_address(bv->bv_page), |
426 | base + j * PAGE_SIZE, PAGE_SIZE); | 429 | base + j * PAGE_SIZE, PAGE_SIZE); |
427 | 430 | ||
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b) | |||
435 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 438 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
436 | 439 | ||
437 | closure_sync(cl); | 440 | closure_sync(cl); |
438 | __btree_node_write_done(cl); | 441 | continue_at_nobarrier(cl, __btree_node_write_done, NULL); |
439 | } | 442 | } |
440 | } | 443 | } |
441 | 444 | ||
442 | void bch_btree_node_write(struct btree *b, struct closure *parent) | 445 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
443 | { | 446 | { |
444 | struct bset *i = b->sets[b->nsets].data; | 447 | struct bset *i = btree_bset_last(b); |
445 | 448 | ||
446 | trace_bcache_btree_write(b); | 449 | trace_bcache_btree_write(b); |
447 | 450 | ||
448 | BUG_ON(current->bio_list); | 451 | BUG_ON(current->bio_list); |
449 | BUG_ON(b->written >= btree_blocks(b)); | 452 | BUG_ON(b->written >= btree_blocks(b)); |
450 | BUG_ON(b->written && !i->keys); | 453 | BUG_ON(b->written && !i->keys); |
451 | BUG_ON(b->sets->data->seq != i->seq); | 454 | BUG_ON(btree_bset_first(b)->seq != i->seq); |
452 | bch_check_keys(b, "writing"); | 455 | bch_check_keys(&b->keys, "writing"); |
453 | 456 | ||
454 | cancel_delayed_work(&b->work); | 457 | cancel_delayed_work(&b->work); |
455 | 458 | ||
456 | /* If caller isn't waiting for write, parent refcount is cache set */ | 459 | /* If caller isn't waiting for write, parent refcount is cache set */ |
457 | closure_lock(&b->io, parent ?: &b->c->cl); | 460 | down(&b->io_mutex); |
461 | closure_init(&b->io, parent ?: &b->c->cl); | ||
458 | 462 | ||
459 | clear_bit(BTREE_NODE_dirty, &b->flags); | 463 | clear_bit(BTREE_NODE_dirty, &b->flags); |
460 | change_bit(BTREE_NODE_write_idx, &b->flags); | 464 | change_bit(BTREE_NODE_write_idx, &b->flags); |
461 | 465 | ||
462 | do_btree_node_write(b); | 466 | do_btree_node_write(b); |
463 | 467 | ||
464 | b->written += set_blocks(i, b->c); | 468 | atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, |
465 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | ||
466 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | 469 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); |
467 | 470 | ||
468 | bch_btree_sort_lazy(b); | 471 | b->written += set_blocks(i, block_bytes(b->c)); |
472 | |||
473 | /* If not a leaf node, always sort */ | ||
474 | if (b->level && b->keys.nsets) | ||
475 | bch_btree_sort(&b->keys, &b->c->sort); | ||
476 | else | ||
477 | bch_btree_sort_lazy(&b->keys, &b->c->sort); | ||
478 | |||
479 | /* | ||
480 | * do verify if there was more than one set initially (i.e. we did a | ||
481 | * sort) and we sorted down to a single set: | ||
482 | */ | ||
483 | if (i != b->keys.set->data && !b->keys.nsets) | ||
484 | bch_btree_verify(b); | ||
469 | 485 | ||
470 | if (b->written < btree_blocks(b)) | 486 | if (b->written < btree_blocks(b)) |
471 | bch_bset_init_next(b); | 487 | bch_bset_init_next(&b->keys, write_block(b), |
488 | bset_magic(&b->c->sb)); | ||
472 | } | 489 | } |
473 | 490 | ||
474 | static void bch_btree_node_write_sync(struct btree *b) | 491 | static void bch_btree_node_write_sync(struct btree *b) |
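Note how the write accounting now passes block_bytes(b->c) to set_blocks() instead of the whole cache_set. A minimal user-space model of that arithmetic, assuming set_blocks() simply rounds the bset's byte footprint up to whole blocks (the struct layout and helper names below are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct bset and the bcache block helpers. */
struct bset_model {
	uint64_t keys;		/* number of u64s of key data */
};

static size_t set_bytes_model(const struct bset_model *i)
{
	/* fixed header (modelled here as 4 u64s) plus the key area */
	return (4 + i->keys) * sizeof(uint64_t);
}

static size_t set_blocks_model(const struct bset_model *i, size_t block_bytes)
{
	/* round up to whole blocks, as the on-disk layout requires */
	return (set_bytes_model(i) + block_bytes - 1) / block_bytes;
}

int main(void)
{
	struct bset_model i = { .keys = 300 };

	/* 4KiB blocks: (4 + 300) * 8 = 2432 bytes -> 1 block */
	printf("%zu\n", set_blocks_model(&i, 4096));
	return 0;
}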
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w) | |||
493 | 510 | ||
494 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | 511 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) |
495 | { | 512 | { |
496 | struct bset *i = b->sets[b->nsets].data; | 513 | struct bset *i = btree_bset_last(b); |
497 | struct btree_write *w = btree_current_write(b); | 514 | struct btree_write *w = btree_current_write(b); |
498 | 515 | ||
499 | BUG_ON(!b->written); | 516 | BUG_ON(!b->written); |
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | |||
528 | * mca -> memory cache | 545 | * mca -> memory cache |
529 | */ | 546 | */ |
530 | 547 | ||
531 | static void mca_reinit(struct btree *b) | ||
532 | { | ||
533 | unsigned i; | ||
534 | |||
535 | b->flags = 0; | ||
536 | b->written = 0; | ||
537 | b->nsets = 0; | ||
538 | |||
539 | for (i = 0; i < MAX_BSETS; i++) | ||
540 | b->sets[i].size = 0; | ||
541 | /* | ||
542 | * Second loop starts at 1 because b->sets[0]->data is the memory we | ||
543 | * allocated | ||
544 | */ | ||
545 | for (i = 1; i < MAX_BSETS; i++) | ||
546 | b->sets[i].data = NULL; | ||
547 | } | ||
548 | |||
549 | #define mca_reserve(c) (((c->root && c->root->level) \ | 548 | #define mca_reserve(c) (((c->root && c->root->level) \ |
550 | ? c->root->level : 1) * 8 + 16) | 549 | ? c->root->level : 1) * 8 + 16) |
551 | #define mca_can_free(c) \ | 550 | #define mca_can_free(c) \ |
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b) | |||
553 | 552 | ||
554 | static void mca_data_free(struct btree *b) | 553 | static void mca_data_free(struct btree *b) |
555 | { | 554 | { |
556 | struct bset_tree *t = b->sets; | 555 | BUG_ON(b->io_mutex.count != 1); |
557 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | ||
558 | 556 | ||
559 | if (bset_prev_bytes(b) < PAGE_SIZE) | 557 | bch_btree_keys_free(&b->keys); |
560 | kfree(t->prev); | ||
561 | else | ||
562 | free_pages((unsigned long) t->prev, | ||
563 | get_order(bset_prev_bytes(b))); | ||
564 | 558 | ||
565 | if (bset_tree_bytes(b) < PAGE_SIZE) | ||
566 | kfree(t->tree); | ||
567 | else | ||
568 | free_pages((unsigned long) t->tree, | ||
569 | get_order(bset_tree_bytes(b))); | ||
570 | |||
571 | free_pages((unsigned long) t->data, b->page_order); | ||
572 | |||
573 | t->prev = NULL; | ||
574 | t->tree = NULL; | ||
575 | t->data = NULL; | ||
576 | list_move(&b->list, &b->c->btree_cache_freed); | ||
577 | b->c->bucket_cache_used--; | 559 | b->c->bucket_cache_used--; |
560 | list_move(&b->list, &b->c->btree_cache_freed); | ||
578 | } | 561 | } |
579 | 562 | ||
580 | static void mca_bucket_free(struct btree *b) | 563 | static void mca_bucket_free(struct btree *b) |
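The removed body of mca_data_free() (and the matching allocation below) open-coded a size-dependent strategy for each auxiliary bset array: slab allocations below a page, page allocations otherwise. Condensed from those removed lines, this is the pattern that bch_btree_keys_free()/bch_btree_keys_alloc() are assumed to encapsulate now (sketch only, not standalone code):

/* Small arrays come from the slab, larger ones from the page allocator. */
static void *bset_aux_alloc(size_t bytes, gfp_t gfp)
{
	return bytes < PAGE_SIZE
		? kmalloc(bytes, gfp)
		: (void *) __get_free_pages(gfp, get_order(bytes));
}

static void bset_aux_free(void *p, size_t bytes)
{
	if (bytes < PAGE_SIZE)
		kfree(p);
	else
		free_pages((unsigned long) p, get_order(bytes));
}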
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k) | |||
593 | 576 | ||
594 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | 577 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) |
595 | { | 578 | { |
596 | struct bset_tree *t = b->sets; | 579 | if (!bch_btree_keys_alloc(&b->keys, |
597 | BUG_ON(t->data); | 580 | max_t(unsigned, |
598 | 581 | ilog2(b->c->btree_pages), | |
599 | b->page_order = max_t(unsigned, | 582 | btree_order(k)), |
600 | ilog2(b->c->btree_pages), | 583 | gfp)) { |
601 | btree_order(k)); | 584 | b->c->bucket_cache_used++; |
602 | 585 | list_move(&b->list, &b->c->btree_cache); | |
603 | t->data = (void *) __get_free_pages(gfp, b->page_order); | 586 | } else { |
604 | if (!t->data) | 587 | list_move(&b->list, &b->c->btree_cache_freed); |
605 | goto err; | 588 | } |
606 | |||
607 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | ||
608 | ? kmalloc(bset_tree_bytes(b), gfp) | ||
609 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | ||
610 | if (!t->tree) | ||
611 | goto err; | ||
612 | |||
613 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | ||
614 | ? kmalloc(bset_prev_bytes(b), gfp) | ||
615 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | ||
616 | if (!t->prev) | ||
617 | goto err; | ||
618 | |||
619 | list_move(&b->list, &b->c->btree_cache); | ||
620 | b->c->bucket_cache_used++; | ||
621 | return; | ||
622 | err: | ||
623 | mca_data_free(b); | ||
624 | } | 589 | } |
625 | 590 | ||
626 | static struct btree *mca_bucket_alloc(struct cache_set *c, | 591 | static struct btree *mca_bucket_alloc(struct cache_set *c, |
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, | |||
635 | INIT_LIST_HEAD(&b->list); | 600 | INIT_LIST_HEAD(&b->list); |
636 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); | 601 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
637 | b->c = c; | 602 | b->c = c; |
638 | closure_init_unlocked(&b->io); | 603 | sema_init(&b->io_mutex, 1); |
639 | 604 | ||
640 | mca_data_alloc(b, k, gfp); | 605 | mca_data_alloc(b, k, gfp); |
641 | return b; | 606 | return b; |
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) | |||
651 | if (!down_write_trylock(&b->lock)) | 616 | if (!down_write_trylock(&b->lock)) |
652 | return -ENOMEM; | 617 | return -ENOMEM; |
653 | 618 | ||
654 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 619 | BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); |
655 | 620 | ||
656 | if (b->page_order < min_order || | 621 | if (b->keys.page_order < min_order) |
657 | (!flush && | 622 | goto out_unlock; |
658 | (btree_node_dirty(b) || | 623 | |
659 | atomic_read(&b->io.cl.remaining) != -1))) { | 624 | if (!flush) { |
660 | rw_unlock(true, b); | 625 | if (btree_node_dirty(b)) |
661 | return -ENOMEM; | 626 | goto out_unlock; |
627 | |||
628 | if (down_trylock(&b->io_mutex)) | ||
629 | goto out_unlock; | ||
630 | up(&b->io_mutex); | ||
662 | } | 631 | } |
663 | 632 | ||
664 | if (btree_node_dirty(b)) | 633 | if (btree_node_dirty(b)) |
665 | bch_btree_node_write_sync(b); | 634 | bch_btree_node_write_sync(b); |
666 | 635 | ||
667 | /* wait for any in flight btree write */ | 636 | /* wait for any in flight btree write */ |
668 | closure_wait_event(&b->io.wait, &cl, | 637 | down(&b->io_mutex); |
669 | atomic_read(&b->io.cl.remaining) == -1); | 638 | up(&b->io_mutex); |
670 | 639 | ||
671 | return 0; | 640 | return 0; |
641 | out_unlock: | ||
642 | rw_unlock(true, b); | ||
643 | return -ENOMEM; | ||
672 | } | 644 | } |
673 | 645 | ||
674 | static unsigned long bch_mca_scan(struct shrinker *shrink, | 646 | static unsigned long bch_mca_scan(struct shrinker *shrink, |
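The semaphore initialized to 1 replaces the closure-based lock: a count of 1 means no btree write is in flight, down_trylock()/up() is a non-blocking "is a write in flight?" probe, and a bare down()/up() pair waits for any outstanding write to drain, exactly as mca_reap() does above. A small user-space analogue with POSIX semaphores (names are illustrative):

#include <semaphore.h>
#include <stdbool.h>

static sem_t io_mutex;			/* count 1 == no write in flight */

static void node_init(void)
{
	sem_init(&io_mutex, 0, 1);
}

/* Start a write: blocks until any previous write has finished. */
static void write_begin(void)
{
	sem_wait(&io_mutex);
}

static void write_end(void)
{
	sem_post(&io_mutex);
}

/* Non-blocking probe, mirroring the down_trylock()/up() pair in mca_reap(). */
static bool write_in_flight(void)
{
	if (sem_trywait(&io_mutex))
		return true;		/* couldn't take it: a writer holds it */
	sem_post(&io_mutex);
	return false;
}

/* Wait for any in-flight write to finish, mirroring the down()/up() pair. */
static void write_drain(void)
{
	sem_wait(&io_mutex);
	sem_post(&io_mutex);
}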
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
714 | } | 686 | } |
715 | } | 687 | } |
716 | 688 | ||
717 | /* | ||
718 | * Can happen right when we first start up, before we've read in any | ||
719 | * btree nodes | ||
720 | */ | ||
721 | if (list_empty(&c->btree_cache)) | ||
722 | goto out; | ||
723 | |||
724 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { | 689 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { |
690 | if (list_empty(&c->btree_cache)) | ||
691 | goto out; | ||
692 | |||
725 | b = list_first_entry(&c->btree_cache, struct btree, list); | 693 | b = list_first_entry(&c->btree_cache, struct btree, list); |
726 | list_rotate_left(&c->btree_cache); | 694 | list_rotate_left(&c->btree_cache); |
727 | 695 | ||
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c) | |||
767 | #ifdef CONFIG_BCACHE_DEBUG | 735 | #ifdef CONFIG_BCACHE_DEBUG |
768 | if (c->verify_data) | 736 | if (c->verify_data) |
769 | list_move(&c->verify_data->list, &c->btree_cache); | 737 | list_move(&c->verify_data->list, &c->btree_cache); |
738 | |||
739 | free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); | ||
770 | #endif | 740 | #endif |
771 | 741 | ||
772 | list_splice(&c->btree_cache_freeable, | 742 | list_splice(&c->btree_cache_freeable, |
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c) | |||
807 | #ifdef CONFIG_BCACHE_DEBUG | 777 | #ifdef CONFIG_BCACHE_DEBUG |
808 | mutex_init(&c->verify_lock); | 778 | mutex_init(&c->verify_lock); |
809 | 779 | ||
780 | c->verify_ondisk = (void *) | ||
781 | __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); | ||
782 | |||
810 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 783 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); |
811 | 784 | ||
812 | if (c->verify_data && | 785 | if (c->verify_data && |
813 | c->verify_data->sets[0].data) | 786 | c->verify_data->keys.set->data) |
814 | list_del_init(&c->verify_data->list); | 787 | list_del_init(&c->verify_data->list); |
815 | else | 788 | else |
816 | c->verify_data = NULL; | 789 | c->verify_data = NULL; |
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) | |||
908 | list_for_each_entry(b, &c->btree_cache_freed, list) | 881 | list_for_each_entry(b, &c->btree_cache_freed, list) |
909 | if (!mca_reap(b, 0, false)) { | 882 | if (!mca_reap(b, 0, false)) { |
910 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | 883 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); |
911 | if (!b->sets[0].data) | 884 | if (!b->keys.set[0].data) |
912 | goto err; | 885 | goto err; |
913 | else | 886 | else |
914 | goto out; | 887 | goto out; |
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) | |||
919 | goto err; | 892 | goto err; |
920 | 893 | ||
921 | BUG_ON(!down_write_trylock(&b->lock)); | 894 | BUG_ON(!down_write_trylock(&b->lock)); |
922 | if (!b->sets->data) | 895 | if (!b->keys.set->data) |
923 | goto err; | 896 | goto err; |
924 | out: | 897 | out: |
925 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | 898 | BUG_ON(b->io_mutex.count != 1); |
926 | 899 | ||
927 | bkey_copy(&b->key, k); | 900 | bkey_copy(&b->key, k); |
928 | list_move(&b->list, &c->btree_cache); | 901 | list_move(&b->list, &c->btree_cache); |
@@ -930,10 +903,17 @@ out: | |||
930 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | 903 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); |
931 | 904 | ||
932 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | 905 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); |
933 | b->level = level; | ||
934 | b->parent = (void *) ~0UL; | 906 | b->parent = (void *) ~0UL; |
907 | b->flags = 0; | ||
908 | b->written = 0; | ||
909 | b->level = level; | ||
935 | 910 | ||
936 | mca_reinit(b); | 911 | if (!b->level) |
912 | bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, | ||
913 | &b->c->expensive_debug_checks); | ||
914 | else | ||
915 | bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, | ||
916 | &b->c->expensive_debug_checks); | ||
937 | 917 | ||
938 | return b; | 918 | return b; |
939 | err: | 919 | err: |
@@ -994,13 +974,13 @@ retry: | |||
994 | 974 | ||
995 | b->accessed = 1; | 975 | b->accessed = 1; |
996 | 976 | ||
997 | for (; i <= b->nsets && b->sets[i].size; i++) { | 977 | for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { |
998 | prefetch(b->sets[i].tree); | 978 | prefetch(b->keys.set[i].tree); |
999 | prefetch(b->sets[i].data); | 979 | prefetch(b->keys.set[i].data); |
1000 | } | 980 | } |
1001 | 981 | ||
1002 | for (; i <= b->nsets; i++) | 982 | for (; i <= b->keys.nsets; i++) |
1003 | prefetch(b->sets[i].data); | 983 | prefetch(b->keys.set[i].data); |
1004 | 984 | ||
1005 | if (btree_node_io_error(b)) { | 985 | if (btree_node_io_error(b)) { |
1006 | rw_unlock(write, b); | 986 | rw_unlock(write, b); |
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) | |||
1063 | 1043 | ||
1064 | mutex_lock(&c->bucket_lock); | 1044 | mutex_lock(&c->bucket_lock); |
1065 | retry: | 1045 | retry: |
1066 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) | 1046 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) |
1067 | goto err; | 1047 | goto err; |
1068 | 1048 | ||
1069 | bkey_put(c, &k.key); | 1049 | bkey_put(c, &k.key); |
@@ -1080,7 +1060,7 @@ retry: | |||
1080 | } | 1060 | } |
1081 | 1061 | ||
1082 | b->accessed = 1; | 1062 | b->accessed = 1; |
1083 | bch_bset_init_next(b); | 1063 | bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); |
1084 | 1064 | ||
1085 | mutex_unlock(&c->bucket_lock); | 1065 | mutex_unlock(&c->bucket_lock); |
1086 | 1066 | ||
@@ -1098,8 +1078,10 @@ err: | |||
1098 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) | 1078 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) |
1099 | { | 1079 | { |
1100 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); | 1080 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); |
1101 | if (!IS_ERR_OR_NULL(n)) | 1081 | if (!IS_ERR_OR_NULL(n)) { |
1102 | bch_btree_sort_into(b, n); | 1082 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); |
1083 | bkey_copy_key(&n->key, &b->key); | ||
1084 | } | ||
1103 | 1085 | ||
1104 | return n; | 1086 | return n; |
1105 | } | 1087 | } |
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) | |||
1120 | atomic_inc(&b->c->prio_blocked); | 1102 | atomic_inc(&b->c->prio_blocked); |
1121 | } | 1103 | } |
1122 | 1104 | ||
1105 | static int btree_check_reserve(struct btree *b, struct btree_op *op) | ||
1106 | { | ||
1107 | struct cache_set *c = b->c; | ||
1108 | struct cache *ca; | ||
1109 | unsigned i, reserve = c->root->level * 2 + 1; | ||
1110 | int ret = 0; | ||
1111 | |||
1112 | mutex_lock(&c->bucket_lock); | ||
1113 | |||
1114 | for_each_cache(ca, c, i) | ||
1115 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { | ||
1116 | if (op) | ||
1117 | prepare_to_wait(&c->bucket_wait, &op->wait, | ||
1118 | TASK_UNINTERRUPTIBLE); | ||
1119 | ret = -EINTR; | ||
1120 | break; | ||
1121 | } | ||
1122 | |||
1123 | mutex_unlock(&c->bucket_lock); | ||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1123 | /* Garbage collection */ | 1127 | /* Garbage collection */ |
1124 | 1128 | ||
1125 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | 1129 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) |
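The new btree_check_reserve() refuses to proceed unless every cache has enough buckets in the RESERVE_BTREE freelist for a worst-case split, and when given an op it also parks the caller on bucket_wait so the operation can be retried once the allocator refills the reserve. A self-contained model of just the reserve arithmetic (structure names simplified; the "two per level plus a new root" reading of root->level * 2 + 1 is an assumption):

#include <stdbool.h>
#include <stddef.h>

struct cache_model {
	size_t free_btree;	/* buckets available in the btree reserve */
};

/* Worst case for a split rippling up to the root: roughly two new nodes
 * per level plus one bucket for a brand new root.
 */
static size_t reserve_needed(unsigned root_level)
{
	return root_level * 2 + 1;
}

static bool have_reserve(const struct cache_model *caches, unsigned ncaches,
			 unsigned root_level)
{
	for (unsigned i = 0; i < ncaches; i++)
		if (caches[i].free_btree < reserve_needed(root_level))
			return false;	/* caller should wait and retry */
	return true;
}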
@@ -1163,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1163 | /* guard against overflow */ | 1167 | /* guard against overflow */ |
1164 | SET_GC_SECTORS_USED(g, min_t(unsigned, | 1168 | SET_GC_SECTORS_USED(g, min_t(unsigned, |
1165 | GC_SECTORS_USED(g) + KEY_SIZE(k), | 1169 | GC_SECTORS_USED(g) + KEY_SIZE(k), |
1166 | (1 << 14) - 1)); | 1170 | MAX_GC_SECTORS_USED)); |
1167 | 1171 | ||
1168 | BUG_ON(!GC_SECTORS_USED(g)); | 1172 | BUG_ON(!GC_SECTORS_USED(g)); |
1169 | } | 1173 | } |
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | |||
1183 | 1187 | ||
1184 | gc->nodes++; | 1188 | gc->nodes++; |
1185 | 1189 | ||
1186 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1190 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { |
1187 | stale = max(stale, btree_mark_key(b, k)); | 1191 | stale = max(stale, btree_mark_key(b, k)); |
1188 | keys++; | 1192 | keys++; |
1189 | 1193 | ||
1190 | if (bch_ptr_bad(b, k)) | 1194 | if (bch_ptr_bad(&b->keys, k)) |
1191 | continue; | 1195 | continue; |
1192 | 1196 | ||
1193 | gc->key_bytes += bkey_u64s(k); | 1197 | gc->key_bytes += bkey_u64s(k); |
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | |||
1197 | gc->data += KEY_SIZE(k); | 1201 | gc->data += KEY_SIZE(k); |
1198 | } | 1202 | } |
1199 | 1203 | ||
1200 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1204 | for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) |
1201 | btree_bug_on(t->size && | 1205 | btree_bug_on(t->size && |
1202 | bset_written(b, t) && | 1206 | bset_written(&b->keys, t) && |
1203 | bkey_cmp(&b->key, &t->end) < 0, | 1207 | bkey_cmp(&b->key, &t->end) < 0, |
1204 | b, "found short btree key in gc"); | 1208 | b, "found short btree key in gc"); |
1205 | 1209 | ||
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1243 | blocks = btree_default_blocks(b->c) * 2 / 3; | 1247 | blocks = btree_default_blocks(b->c) * 2 / 3; |
1244 | 1248 | ||
1245 | if (nodes < 2 || | 1249 | if (nodes < 2 || |
1246 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | 1250 | __set_blocks(b->keys.set[0].data, keys, |
1251 | block_bytes(b->c)) > blocks * (nodes - 1)) | ||
1247 | return 0; | 1252 | return 0; |
1248 | 1253 | ||
1249 | for (i = 0; i < nodes; i++) { | 1254 | for (i = 0; i < nodes; i++) { |
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1253 | } | 1258 | } |
1254 | 1259 | ||
1255 | for (i = nodes - 1; i > 0; --i) { | 1260 | for (i = nodes - 1; i > 0; --i) { |
1256 | struct bset *n1 = new_nodes[i]->sets->data; | 1261 | struct bset *n1 = btree_bset_first(new_nodes[i]); |
1257 | struct bset *n2 = new_nodes[i - 1]->sets->data; | 1262 | struct bset *n2 = btree_bset_first(new_nodes[i - 1]); |
1258 | struct bkey *k, *last = NULL; | 1263 | struct bkey *k, *last = NULL; |
1259 | 1264 | ||
1260 | keys = 0; | 1265 | keys = 0; |
1261 | 1266 | ||
1262 | if (i > 1) { | 1267 | if (i > 1) { |
1263 | for (k = n2->start; | 1268 | for (k = n2->start; |
1264 | k < end(n2); | 1269 | k < bset_bkey_last(n2); |
1265 | k = bkey_next(k)) { | 1270 | k = bkey_next(k)) { |
1266 | if (__set_blocks(n1, n1->keys + keys + | 1271 | if (__set_blocks(n1, n1->keys + keys + |
1267 | bkey_u64s(k), b->c) > blocks) | 1272 | bkey_u64s(k), |
1273 | block_bytes(b->c)) > blocks) | ||
1268 | break; | 1274 | break; |
1269 | 1275 | ||
1270 | last = k; | 1276 | last = k; |
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1280 | * though) | 1286 | * though) |
1281 | */ | 1287 | */ |
1282 | if (__set_blocks(n1, n1->keys + n2->keys, | 1288 | if (__set_blocks(n1, n1->keys + n2->keys, |
1283 | b->c) > btree_blocks(new_nodes[i])) | 1289 | block_bytes(b->c)) > |
1290 | btree_blocks(new_nodes[i])) | ||
1284 | goto out_nocoalesce; | 1291 | goto out_nocoalesce; |
1285 | 1292 | ||
1286 | keys = n2->keys; | 1293 | keys = n2->keys; |
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1288 | last = &r->b->key; | 1295 | last = &r->b->key; |
1289 | } | 1296 | } |
1290 | 1297 | ||
1291 | BUG_ON(__set_blocks(n1, n1->keys + keys, | 1298 | BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > |
1292 | b->c) > btree_blocks(new_nodes[i])); | 1299 | btree_blocks(new_nodes[i])); |
1293 | 1300 | ||
1294 | if (last) | 1301 | if (last) |
1295 | bkey_copy_key(&new_nodes[i]->key, last); | 1302 | bkey_copy_key(&new_nodes[i]->key, last); |
1296 | 1303 | ||
1297 | memcpy(end(n1), | 1304 | memcpy(bset_bkey_last(n1), |
1298 | n2->start, | 1305 | n2->start, |
1299 | (void *) node(n2, keys) - (void *) n2->start); | 1306 | (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); |
1300 | 1307 | ||
1301 | n1->keys += keys; | 1308 | n1->keys += keys; |
1302 | r[i].keys = n1->keys; | 1309 | r[i].keys = n1->keys; |
1303 | 1310 | ||
1304 | memmove(n2->start, | 1311 | memmove(n2->start, |
1305 | node(n2, keys), | 1312 | bset_bkey_idx(n2, keys), |
1306 | (void *) end(n2) - (void *) node(n2, keys)); | 1313 | (void *) bset_bkey_last(n2) - |
1314 | (void *) bset_bkey_idx(n2, keys)); | ||
1307 | 1315 | ||
1308 | n2->keys -= keys; | 1316 | n2->keys -= keys; |
1309 | 1317 | ||
1310 | if (bch_keylist_realloc(keylist, | 1318 | if (__bch_keylist_realloc(keylist, |
1311 | KEY_PTRS(&new_nodes[i]->key), b->c)) | 1319 | bkey_u64s(&new_nodes[i]->key))) |
1312 | goto out_nocoalesce; | 1320 | goto out_nocoalesce; |
1313 | 1321 | ||
1314 | bch_btree_node_write(new_nodes[i], &cl); | 1322 | bch_btree_node_write(new_nodes[i], &cl); |
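The memcpy()/memmove() pair above moves the first `keys` u64s worth of bkeys from the front of n2 onto the end of n1, then slides what is left of n2 down to close the gap. A self-contained model over plain u64 buffers (field names simplified; in the real struct bset, `keys` counts u64s of key data):

#include <stdint.h>
#include <string.h>

struct bset_model {
	uint64_t keys;		/* u64s of key data currently in d[] */
	uint64_t d[256];
};

/* Append the first `keys` u64s of src onto dst, then compact src. */
static void steal_front(struct bset_model *dst, struct bset_model *src,
			uint64_t keys)
{
	memcpy(dst->d + dst->keys, src->d, keys * sizeof(uint64_t));
	dst->keys += keys;

	memmove(src->d, src->d + keys,
		(src->keys - keys) * sizeof(uint64_t));
	src->keys -= keys;
}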
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1316 | } | 1324 | } |
1317 | 1325 | ||
1318 | for (i = 0; i < nodes; i++) { | 1326 | for (i = 0; i < nodes; i++) { |
1319 | if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) | 1327 | if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) |
1320 | goto out_nocoalesce; | 1328 | goto out_nocoalesce; |
1321 | 1329 | ||
1322 | make_btree_freeing_key(r[i].b, keylist->top); | 1330 | make_btree_freeing_key(r[i].b, keylist->top); |
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1324 | } | 1332 | } |
1325 | 1333 | ||
1326 | /* We emptied out this node */ | 1334 | /* We emptied out this node */ |
1327 | BUG_ON(new_nodes[0]->sets->data->keys); | 1335 | BUG_ON(btree_bset_first(new_nodes[0])->keys); |
1328 | btree_node_free(new_nodes[0]); | 1336 | btree_node_free(new_nodes[0]); |
1329 | rw_unlock(true, new_nodes[0]); | 1337 | rw_unlock(true, new_nodes[0]); |
1330 | 1338 | ||
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b) | |||
1370 | struct btree_iter iter; | 1378 | struct btree_iter iter; |
1371 | unsigned ret = 0; | 1379 | unsigned ret = 0; |
1372 | 1380 | ||
1373 | for_each_key_filter(b, k, &iter, bch_ptr_bad) | 1381 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) |
1374 | ret += bkey_u64s(k); | 1382 | ret += bkey_u64s(k); |
1375 | 1383 | ||
1376 | return ret; | 1384 | return ret; |
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1390 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; | 1398 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; |
1391 | 1399 | ||
1392 | bch_keylist_init(&keys); | 1400 | bch_keylist_init(&keys); |
1393 | bch_btree_iter_init(b, &iter, &b->c->gc_done); | 1401 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); |
1394 | 1402 | ||
1395 | for (i = 0; i < GC_MERGE_NODES; i++) | 1403 | for (i = 0; i < GC_MERGE_NODES; i++) |
1396 | r[i].b = ERR_PTR(-EINTR); | 1404 | r[i].b = ERR_PTR(-EINTR); |
1397 | 1405 | ||
1398 | while (1) { | 1406 | while (1) { |
1399 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 1407 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); |
1400 | if (k) { | 1408 | if (k) { |
1401 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); | 1409 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); |
1402 | if (IS_ERR(r->b)) { | 1410 | if (IS_ERR(r->b)) { |
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1416 | 1424 | ||
1417 | if (!IS_ERR(last->b)) { | 1425 | if (!IS_ERR(last->b)) { |
1418 | should_rewrite = btree_gc_mark_node(last->b, gc); | 1426 | should_rewrite = btree_gc_mark_node(last->b, gc); |
1419 | if (should_rewrite) { | 1427 | if (should_rewrite && |
1428 | !btree_check_reserve(b, NULL)) { | ||
1420 | n = btree_node_alloc_replacement(last->b, | 1429 | n = btree_node_alloc_replacement(last->b, |
1421 | false); | 1430 | false); |
1422 | 1431 | ||
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |||
1705 | struct bucket *g; | 1714 | struct bucket *g; |
1706 | struct btree_iter iter; | 1715 | struct btree_iter iter; |
1707 | 1716 | ||
1708 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1717 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { |
1709 | for (i = 0; i < KEY_PTRS(k); i++) { | 1718 | for (i = 0; i < KEY_PTRS(k); i++) { |
1710 | if (!ptr_available(b->c, k, i)) | 1719 | if (!ptr_available(b->c, k, i)) |
1711 | continue; | 1720 | continue; |
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |||
1728 | } | 1737 | } |
1729 | 1738 | ||
1730 | if (b->level) { | 1739 | if (b->level) { |
1731 | bch_btree_iter_init(b, &iter, NULL); | 1740 | bch_btree_iter_init(&b->keys, &iter, NULL); |
1732 | 1741 | ||
1733 | do { | 1742 | do { |
1734 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 1743 | k = bch_btree_iter_next_filter(&iter, &b->keys, |
1744 | bch_ptr_bad); | ||
1735 | if (k) | 1745 | if (k) |
1736 | btree_node_prefetch(b->c, k, b->level - 1); | 1746 | btree_node_prefetch(b->c, k, b->level - 1); |
1737 | 1747 | ||
@@ -1774,235 +1784,36 @@ err: | |||
1774 | 1784 | ||
1775 | /* Btree insertion */ | 1785 | /* Btree insertion */ |
1776 | 1786 | ||
1777 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | 1787 | static bool btree_insert_key(struct btree *b, struct bkey *k, |
1778 | { | 1788 | struct bkey *replace_key) |
1779 | struct bset *i = b->sets[b->nsets].data; | ||
1780 | |||
1781 | memmove((uint64_t *) where + bkey_u64s(insert), | ||
1782 | where, | ||
1783 | (void *) end(i) - (void *) where); | ||
1784 | |||
1785 | i->keys += bkey_u64s(insert); | ||
1786 | bkey_copy(where, insert); | ||
1787 | bch_bset_fix_lookup_table(b, where); | ||
1788 | } | ||
1789 | |||
1790 | static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, | ||
1791 | struct btree_iter *iter, | ||
1792 | struct bkey *replace_key) | ||
1793 | { | 1789 | { |
1794 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | 1790 | unsigned status; |
1795 | { | ||
1796 | if (KEY_DIRTY(k)) | ||
1797 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | ||
1798 | offset, -sectors); | ||
1799 | } | ||
1800 | |||
1801 | uint64_t old_offset; | ||
1802 | unsigned old_size, sectors_found = 0; | ||
1803 | |||
1804 | while (1) { | ||
1805 | struct bkey *k = bch_btree_iter_next(iter); | ||
1806 | if (!k || | ||
1807 | bkey_cmp(&START_KEY(k), insert) >= 0) | ||
1808 | break; | ||
1809 | |||
1810 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||
1811 | continue; | ||
1812 | |||
1813 | old_offset = KEY_START(k); | ||
1814 | old_size = KEY_SIZE(k); | ||
1815 | |||
1816 | /* | ||
1817 | * We might overlap with 0 size extents; we can't skip these | ||
1818 | * because if they're in the set we're inserting to we have to | ||
1819 | * adjust them so they don't overlap with the key we're | ||
1820 | * inserting. But we don't want to check them for replace | ||
1821 | * operations. | ||
1822 | */ | ||
1823 | |||
1824 | if (replace_key && KEY_SIZE(k)) { | ||
1825 | /* | ||
1826 | * k might have been split since we inserted/found the | ||
1827 | * key we're replacing | ||
1828 | */ | ||
1829 | unsigned i; | ||
1830 | uint64_t offset = KEY_START(k) - | ||
1831 | KEY_START(replace_key); | ||
1832 | |||
1833 | /* But it must be a subset of the replace key */ | ||
1834 | if (KEY_START(k) < KEY_START(replace_key) || | ||
1835 | KEY_OFFSET(k) > KEY_OFFSET(replace_key)) | ||
1836 | goto check_failed; | ||
1837 | |||
1838 | /* We didn't find a key that we were supposed to */ | ||
1839 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||
1840 | goto check_failed; | ||
1841 | |||
1842 | if (KEY_PTRS(k) != KEY_PTRS(replace_key) || | ||
1843 | KEY_DIRTY(k) != KEY_DIRTY(replace_key)) | ||
1844 | goto check_failed; | ||
1845 | |||
1846 | /* skip past gen */ | ||
1847 | offset <<= 8; | ||
1848 | |||
1849 | BUG_ON(!KEY_PTRS(replace_key)); | ||
1850 | 1791 | ||
1851 | for (i = 0; i < KEY_PTRS(replace_key); i++) | 1792 | BUG_ON(bkey_cmp(k, &b->key) > 0); |
1852 | if (k->ptr[i] != replace_key->ptr[i] + offset) | ||
1853 | goto check_failed; | ||
1854 | |||
1855 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||
1856 | } | ||
1857 | |||
1858 | if (bkey_cmp(insert, k) < 0 && | ||
1859 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||
1860 | /* | ||
1861 | * We overlapped in the middle of an existing key: that | ||
1862 | * means we have to split the old key. But we have to do | ||
1863 | * slightly different things depending on whether the | ||
1864 | * old key has been written out yet. | ||
1865 | */ | ||
1866 | |||
1867 | struct bkey *top; | ||
1868 | |||
1869 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | ||
1870 | |||
1871 | if (bkey_written(b, k)) { | ||
1872 | /* | ||
1873 | * We insert a new key to cover the top of the | ||
1874 | * old key, and the old key is modified in place | ||
1875 | * to represent the bottom split. | ||
1876 | * | ||
1877 | * It's completely arbitrary whether the new key | ||
1878 | * is the top or the bottom, but it has to match | ||
1879 | * up with what btree_sort_fixup() does - it | ||
1880 | * doesn't check for this kind of overlap, it | ||
1881 | * depends on us inserting a new key for the top | ||
1882 | * here. | ||
1883 | */ | ||
1884 | top = bch_bset_search(b, &b->sets[b->nsets], | ||
1885 | insert); | ||
1886 | shift_keys(b, top, k); | ||
1887 | } else { | ||
1888 | BKEY_PADDED(key) temp; | ||
1889 | bkey_copy(&temp.key, k); | ||
1890 | shift_keys(b, k, &temp.key); | ||
1891 | top = bkey_next(k); | ||
1892 | } | ||
1893 | |||
1894 | bch_cut_front(insert, top); | ||
1895 | bch_cut_back(&START_KEY(insert), k); | ||
1896 | bch_bset_fix_invalidated_key(b, k); | ||
1897 | return false; | ||
1898 | } | ||
1899 | |||
1900 | if (bkey_cmp(insert, k) < 0) { | ||
1901 | bch_cut_front(insert, k); | ||
1902 | } else { | ||
1903 | if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) | ||
1904 | old_offset = KEY_START(insert); | ||
1905 | |||
1906 | if (bkey_written(b, k) && | ||
1907 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||
1908 | /* | ||
1909 | * Completely overwrote, so we don't have to | ||
1910 | * invalidate the binary search tree | ||
1911 | */ | ||
1912 | bch_cut_front(k, k); | ||
1913 | } else { | ||
1914 | __bch_cut_back(&START_KEY(insert), k); | ||
1915 | bch_bset_fix_invalidated_key(b, k); | ||
1916 | } | ||
1917 | } | ||
1918 | |||
1919 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | ||
1920 | } | ||
1921 | 1793 | ||
1922 | check_failed: | 1794 | status = bch_btree_insert_key(&b->keys, k, replace_key); |
1923 | if (replace_key) { | 1795 | if (status != BTREE_INSERT_STATUS_NO_INSERT) { |
1924 | if (!sectors_found) { | 1796 | bch_check_keys(&b->keys, "%u for %s", status, |
1925 | return true; | 1797 | replace_key ? "replace" : "insert"); |
1926 | } else if (sectors_found < KEY_SIZE(insert)) { | ||
1927 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||
1928 | (KEY_SIZE(insert) - sectors_found)); | ||
1929 | SET_KEY_SIZE(insert, sectors_found); | ||
1930 | } | ||
1931 | } | ||
1932 | 1798 | ||
1933 | return false; | 1799 | trace_bcache_btree_insert_key(b, k, replace_key != NULL, |
1800 | status); | ||
1801 | return true; | ||
1802 | } else | ||
1803 | return false; | ||
1934 | } | 1804 | } |
1935 | 1805 | ||
1936 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | 1806 | static size_t insert_u64s_remaining(struct btree *b) |
1937 | struct bkey *k, struct bkey *replace_key) | ||
1938 | { | 1807 | { |
1939 | struct bset *i = b->sets[b->nsets].data; | 1808 | long ret = bch_btree_keys_u64s_remaining(&b->keys); |
1940 | struct bkey *m, *prev; | ||
1941 | unsigned status = BTREE_INSERT_STATUS_INSERT; | ||
1942 | |||
1943 | BUG_ON(bkey_cmp(k, &b->key) > 0); | ||
1944 | BUG_ON(b->level && !KEY_PTRS(k)); | ||
1945 | BUG_ON(!b->level && !KEY_OFFSET(k)); | ||
1946 | |||
1947 | if (!b->level) { | ||
1948 | struct btree_iter iter; | ||
1949 | |||
1950 | /* | ||
1951 | * bset_search() returns the first key that is strictly greater | ||
1952 | * than the search key - but for back merging, we want to find | ||
1953 | * the previous key. | ||
1954 | */ | ||
1955 | prev = NULL; | ||
1956 | m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); | ||
1957 | 1809 | ||
1958 | if (fix_overlapping_extents(b, k, &iter, replace_key)) { | 1810 | /* |
1959 | op->insert_collision = true; | 1811 | * Might land in the middle of an existing extent and have to split it |
1960 | return false; | 1812 | */ |
1961 | } | 1813 | if (b->keys.ops->is_extents) |
1962 | 1814 | ret -= KEY_MAX_U64S; | |
1963 | if (KEY_DIRTY(k)) | ||
1964 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | ||
1965 | KEY_START(k), KEY_SIZE(k)); | ||
1966 | |||
1967 | while (m != end(i) && | ||
1968 | bkey_cmp(k, &START_KEY(m)) > 0) | ||
1969 | prev = m, m = bkey_next(m); | ||
1970 | |||
1971 | if (key_merging_disabled(b->c)) | ||
1972 | goto insert; | ||
1973 | |||
1974 | /* prev is in the tree, if we merge we're done */ | ||
1975 | status = BTREE_INSERT_STATUS_BACK_MERGE; | ||
1976 | if (prev && | ||
1977 | bch_bkey_try_merge(b, prev, k)) | ||
1978 | goto merged; | ||
1979 | |||
1980 | status = BTREE_INSERT_STATUS_OVERWROTE; | ||
1981 | if (m != end(i) && | ||
1982 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | ||
1983 | goto copy; | ||
1984 | |||
1985 | status = BTREE_INSERT_STATUS_FRONT_MERGE; | ||
1986 | if (m != end(i) && | ||
1987 | bch_bkey_try_merge(b, k, m)) | ||
1988 | goto copy; | ||
1989 | } else { | ||
1990 | BUG_ON(replace_key); | ||
1991 | m = bch_bset_search(b, &b->sets[b->nsets], k); | ||
1992 | } | ||
1993 | |||
1994 | insert: shift_keys(b, m, k); | ||
1995 | copy: bkey_copy(m, k); | ||
1996 | merged: | ||
1997 | bch_check_keys(b, "%u for %s", status, | ||
1998 | replace_key ? "replace" : "insert"); | ||
1999 | |||
2000 | if (b->level && !KEY_OFFSET(k)) | ||
2001 | btree_current_write(b)->prio_blocked++; | ||
2002 | |||
2003 | trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); | ||
2004 | 1815 | ||
2005 | return true; | 1816 | return max(ret, 0L); |
2006 | } | 1817 | } |
2007 | 1818 | ||
2008 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | 1819 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, |
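insert_u64s_remaining() becomes the single capacity check for inserts: whatever space bch_btree_keys_u64s_remaining() reports, minus headroom for one maximum-size key on extent nodes (an insert can land in the middle of an existing extent and force a split), clamped at zero. A model of that computation (the constant is illustrative, not the kernel's KEY_MAX_U64S):

#include <stdbool.h>
#include <stddef.h>

#define KEY_MAX_U64S_MODEL 8	/* worst-case size of one key, in u64s */

static size_t insert_u64s_remaining_model(long space_left_u64s,
					  bool is_extents)
{
	long ret = space_left_u64s;

	/* Reserve room for the extra key a mid-extent split would create. */
	if (is_extents)
		ret -= KEY_MAX_U64S_MODEL;

	return ret > 0 ? ret : 0;
}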
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | |||
2010 | struct bkey *replace_key) | 1821 | struct bkey *replace_key) |
2011 | { | 1822 | { |
2012 | bool ret = false; | 1823 | bool ret = false; |
2013 | int oldsize = bch_count_data(b); | 1824 | int oldsize = bch_count_data(&b->keys); |
2014 | 1825 | ||
2015 | while (!bch_keylist_empty(insert_keys)) { | 1826 | while (!bch_keylist_empty(insert_keys)) { |
2016 | struct bset *i = write_block(b); | ||
2017 | struct bkey *k = insert_keys->keys; | 1827 | struct bkey *k = insert_keys->keys; |
2018 | 1828 | ||
2019 | if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) | 1829 | if (bkey_u64s(k) > insert_u64s_remaining(b)) |
2020 | > btree_blocks(b)) | ||
2021 | break; | 1830 | break; |
2022 | 1831 | ||
2023 | if (bkey_cmp(k, &b->key) <= 0) { | 1832 | if (bkey_cmp(k, &b->key) <= 0) { |
2024 | if (!b->level) | 1833 | if (!b->level) |
2025 | bkey_put(b->c, k); | 1834 | bkey_put(b->c, k); |
2026 | 1835 | ||
2027 | ret |= btree_insert_key(b, op, k, replace_key); | 1836 | ret |= btree_insert_key(b, k, replace_key); |
2028 | bch_keylist_pop_front(insert_keys); | 1837 | bch_keylist_pop_front(insert_keys); |
2029 | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { | 1838 | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { |
2030 | BKEY_PADDED(key) temp; | 1839 | BKEY_PADDED(key) temp; |
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | |||
2033 | bch_cut_back(&b->key, &temp.key); | 1842 | bch_cut_back(&b->key, &temp.key); |
2034 | bch_cut_front(&b->key, insert_keys->keys); | 1843 | bch_cut_front(&b->key, insert_keys->keys); |
2035 | 1844 | ||
2036 | ret |= btree_insert_key(b, op, &temp.key, replace_key); | 1845 | ret |= btree_insert_key(b, &temp.key, replace_key); |
2037 | break; | 1846 | break; |
2038 | } else { | 1847 | } else { |
2039 | break; | 1848 | break; |
2040 | } | 1849 | } |
2041 | } | 1850 | } |
2042 | 1851 | ||
1852 | if (!ret) | ||
1853 | op->insert_collision = true; | ||
1854 | |||
2043 | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); | 1855 | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); |
2044 | 1856 | ||
2045 | BUG_ON(bch_count_data(b) < oldsize); | 1857 | BUG_ON(bch_count_data(&b->keys) < oldsize); |
2046 | return ret; | 1858 | return ret; |
2047 | } | 1859 | } |
2048 | 1860 | ||
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2059 | closure_init_stack(&cl); | 1871 | closure_init_stack(&cl); |
2060 | bch_keylist_init(&parent_keys); | 1872 | bch_keylist_init(&parent_keys); |
2061 | 1873 | ||
1874 | if (!b->level && | ||
1875 | btree_check_reserve(b, op)) | ||
1876 | return -EINTR; | ||
1877 | |||
2062 | n1 = btree_node_alloc_replacement(b, true); | 1878 | n1 = btree_node_alloc_replacement(b, true); |
2063 | if (IS_ERR(n1)) | 1879 | if (IS_ERR(n1)) |
2064 | goto err; | 1880 | goto err; |
2065 | 1881 | ||
2066 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | 1882 | split = set_blocks(btree_bset_first(n1), |
1883 | block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; | ||
2067 | 1884 | ||
2068 | if (split) { | 1885 | if (split) { |
2069 | unsigned keys = 0; | 1886 | unsigned keys = 0; |
2070 | 1887 | ||
2071 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | 1888 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); |
2072 | 1889 | ||
2073 | n2 = bch_btree_node_alloc(b->c, b->level, true); | 1890 | n2 = bch_btree_node_alloc(b->c, b->level, true); |
2074 | if (IS_ERR(n2)) | 1891 | if (IS_ERR(n2)) |
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2087 | * search tree yet | 1904 | * search tree yet |
2088 | */ | 1905 | */ |
2089 | 1906 | ||
2090 | while (keys < (n1->sets[0].data->keys * 3) / 5) | 1907 | while (keys < (btree_bset_first(n1)->keys * 3) / 5) |
2091 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1908 | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), |
1909 | keys)); | ||
2092 | 1910 | ||
2093 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); | 1911 | bkey_copy_key(&n1->key, |
2094 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1912 | bset_bkey_idx(btree_bset_first(n1), keys)); |
1913 | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); | ||
2095 | 1914 | ||
2096 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; | 1915 | btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; |
2097 | n1->sets[0].data->keys = keys; | 1916 | btree_bset_first(n1)->keys = keys; |
2098 | 1917 | ||
2099 | memcpy(n2->sets[0].data->start, | 1918 | memcpy(btree_bset_first(n2)->start, |
2100 | end(n1->sets[0].data), | 1919 | bset_bkey_last(btree_bset_first(n1)), |
2101 | n2->sets[0].data->keys * sizeof(uint64_t)); | 1920 | btree_bset_first(n2)->keys * sizeof(uint64_t)); |
2102 | 1921 | ||
2103 | bkey_copy_key(&n2->key, &b->key); | 1922 | bkey_copy_key(&n2->key, &b->key); |
2104 | 1923 | ||
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2106 | bch_btree_node_write(n2, &cl); | 1925 | bch_btree_node_write(n2, &cl); |
2107 | rw_unlock(true, n2); | 1926 | rw_unlock(true, n2); |
2108 | } else { | 1927 | } else { |
2109 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | 1928 | trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); |
2110 | 1929 | ||
2111 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); | 1930 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
2112 | } | 1931 | } |
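The split-point loop above stops once roughly three fifths of the u64s sit in n1, but it only ever advances a whole bkey at a time, so no key is ever cut in half. A simplified model in which each key record begins with a word holding its own length in u64s (a stand-in for bkey_u64s()):

#include <stdint.h>

/* d[] is key data; d[k] at the start of each key is that key's length
 * in u64s (always >= 1), standing in for bkey_u64s().
 */
static uint64_t split_point(const uint64_t *d, uint64_t total_u64s)
{
	uint64_t keys = 0;

	while (keys < total_u64s * 3 / 5)
		keys += d[keys];	/* advance by one whole key */

	return keys;		/* n1 keeps d[0..keys), n2 gets the rest */
}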
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
2149 | 1968 | ||
2150 | return 0; | 1969 | return 0; |
2151 | err_free2: | 1970 | err_free2: |
1971 | bkey_put(b->c, &n2->key); | ||
2152 | btree_node_free(n2); | 1972 | btree_node_free(n2); |
2153 | rw_unlock(true, n2); | 1973 | rw_unlock(true, n2); |
2154 | err_free1: | 1974 | err_free1: |
1975 | bkey_put(b->c, &n1->key); | ||
2155 | btree_node_free(n1); | 1976 | btree_node_free(n1); |
2156 | rw_unlock(true, n1); | 1977 | rw_unlock(true, n1); |
2157 | err: | 1978 | err: |
1979 | WARN(1, "bcache: btree split failed"); | ||
1980 | |||
2158 | if (n3 == ERR_PTR(-EAGAIN) || | 1981 | if (n3 == ERR_PTR(-EAGAIN) || |
2159 | n2 == ERR_PTR(-EAGAIN) || | 1982 | n2 == ERR_PTR(-EAGAIN) || |
2160 | n1 == ERR_PTR(-EAGAIN)) | 1983 | n1 == ERR_PTR(-EAGAIN)) |
2161 | return -EAGAIN; | 1984 | return -EAGAIN; |
2162 | 1985 | ||
2163 | pr_warn("couldn't split"); | ||
2164 | return -ENOMEM; | 1986 | return -ENOMEM; |
2165 | } | 1987 | } |
2166 | 1988 | ||
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | |||
2171 | { | 1993 | { |
2172 | BUG_ON(b->level && replace_key); | 1994 | BUG_ON(b->level && replace_key); |
2173 | 1995 | ||
2174 | if (should_split(b)) { | 1996 | if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { |
2175 | if (current->bio_list) { | 1997 | if (current->bio_list) { |
2176 | op->lock = b->c->root->level + 1; | 1998 | op->lock = b->c->root->level + 1; |
2177 | return -EAGAIN; | 1999 | return -EAGAIN; |
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | |||
2180 | return -EINTR; | 2002 | return -EINTR; |
2181 | } else { | 2003 | } else { |
2182 | /* Invalidated all iterators */ | 2004 | /* Invalidated all iterators */ |
2183 | return btree_split(b, op, insert_keys, replace_key) ?: | 2005 | int ret = btree_split(b, op, insert_keys, replace_key); |
2184 | -EINTR; | 2006 | |
2007 | return bch_keylist_empty(insert_keys) ? | ||
2008 | 0 : ret ?: -EINTR; | ||
2185 | } | 2009 | } |
2186 | } else { | 2010 | } else { |
2187 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2011 | BUG_ON(write_block(b) != btree_bset_last(b)); |
2188 | 2012 | ||
2189 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { | 2013 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { |
2190 | if (!b->level) | 2014 | if (!b->level) |
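The new return expression leans on the GNU `?:` extension: `a ?: b` evaluates to `a` when `a` is non-zero and to `b` otherwise, so `ret ?: -EINTR` means "propagate the split error if there was one, otherwise force a retry". Expanded for clarity:

/* bch_keylist_empty(insert_keys) ? 0 : ret ?: -EINTR  is equivalent to: */
if (bch_keylist_empty(insert_keys))
	return 0;		/* everything was inserted during the split */
else if (ret)
	return ret;		/* the split itself failed */
else
	return -EINTR;		/* partial insert: caller must retry */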
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, | |||
2323 | struct bkey *k; | 2147 | struct bkey *k; |
2324 | struct btree_iter iter; | 2148 | struct btree_iter iter; |
2325 | 2149 | ||
2326 | bch_btree_iter_init(b, &iter, from); | 2150 | bch_btree_iter_init(&b->keys, &iter, from); |
2327 | 2151 | ||
2328 | while ((k = bch_btree_iter_next_filter(&iter, b, | 2152 | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, |
2329 | bch_ptr_bad))) { | 2153 | bch_ptr_bad))) { |
2330 | ret = btree(map_nodes_recurse, k, b, | 2154 | ret = btree(map_nodes_recurse, k, b, |
2331 | op, from, fn, flags); | 2155 | op, from, fn, flags); |
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, | |||
2356 | struct bkey *k; | 2180 | struct bkey *k; |
2357 | struct btree_iter iter; | 2181 | struct btree_iter iter; |
2358 | 2182 | ||
2359 | bch_btree_iter_init(b, &iter, from); | 2183 | bch_btree_iter_init(&b->keys, &iter, from); |
2360 | 2184 | ||
2361 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { | 2185 | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { |
2362 | ret = !b->level | 2186 | ret = !b->level |
2363 | ? fn(op, b, k) | 2187 | ? fn(op, b, k) |
2364 | : btree(map_keys_recurse, k, b, op, from, fn, flags); | 2188 | : btree(map_keys_recurse, k, b, op, from, fn, flags); |
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 767e75570896..af065e97e55c 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
@@ -130,20 +130,12 @@ struct btree { | |||
130 | unsigned long flags; | 130 | unsigned long flags; |
131 | uint16_t written; /* would be nice to kill */ | 131 | uint16_t written; /* would be nice to kill */ |
132 | uint8_t level; | 132 | uint8_t level; |
133 | uint8_t nsets; | 133 | |
134 | uint8_t page_order; | 134 | struct btree_keys keys; |
135 | |||
136 | /* | ||
137 | * Set of sorted keys - the real btree node - plus a binary search tree | ||
138 | * | ||
139 | * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||
140 | * to the memory we have allocated for this btree node. Additionally, | ||
141 | * set[0]->data points to the entire btree node as it exists on disk. | ||
142 | */ | ||
143 | struct bset_tree sets[MAX_BSETS]; | ||
144 | 135 | ||
145 | /* For outstanding btree writes, used as a lock - protects write_idx */ | 136 | /* For outstanding btree writes, used as a lock - protects write_idx */ |
146 | struct closure_with_waitlist io; | 137 | struct closure io; |
138 | struct semaphore io_mutex; | ||
147 | 139 | ||
148 | struct list_head list; | 140 | struct list_head list; |
149 | struct delayed_work work; | 141 | struct delayed_work work; |
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b) | |||
179 | return b->writes + (btree_node_write_idx(b) ^ 1); | 171 | return b->writes + (btree_node_write_idx(b) ^ 1); |
180 | } | 172 | } |
181 | 173 | ||
182 | static inline unsigned bset_offset(struct btree *b, struct bset *i) | 174 | static inline struct bset *btree_bset_first(struct btree *b) |
183 | { | 175 | { |
184 | return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | 176 | return b->keys.set->data; |
185 | } | 177 | } |
186 | 178 | ||
187 | static inline struct bset *write_block(struct btree *b) | 179 | static inline struct bset *btree_bset_last(struct btree *b) |
188 | { | 180 | { |
189 | return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | 181 | return bset_tree_last(&b->keys)->data; |
190 | } | 182 | } |
191 | 183 | ||
192 | static inline bool bset_written(struct btree *b, struct bset_tree *t) | 184 | static inline unsigned bset_block_offset(struct btree *b, struct bset *i) |
193 | { | 185 | { |
194 | return t->data < write_block(b); | 186 | return bset_sector_offset(&b->keys, i) >> b->c->block_bits; |
195 | } | ||
196 | |||
197 | static inline bool bkey_written(struct btree *b, struct bkey *k) | ||
198 | { | ||
199 | return k < write_block(b)->start; | ||
200 | } | 187 | } |
201 | 188 | ||
202 | static inline void set_gc_sectors(struct cache_set *c) | 189 | static inline void set_gc_sectors(struct cache_set *c) |
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c) | |||
204 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); | 191 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); |
205 | } | 192 | } |
206 | 193 | ||
207 | static inline struct bkey *bch_btree_iter_init(struct btree *b, | ||
208 | struct btree_iter *iter, | ||
209 | struct bkey *search) | ||
210 | { | ||
211 | return __bch_btree_iter_init(b, iter, search, b->sets); | ||
212 | } | ||
213 | |||
214 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||
215 | { | ||
216 | if (b->level) | ||
217 | return bch_btree_ptr_invalid(b->c, k); | ||
218 | else | ||
219 | return bch_extent_ptr_invalid(b->c, k); | ||
220 | } | ||
221 | |||
222 | void bkey_put(struct cache_set *c, struct bkey *k); | 194 | void bkey_put(struct cache_set *c, struct bkey *k); |
223 | 195 | ||
224 | /* Looping macros */ | 196 | /* Looping macros */ |
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k); | |||
229 | iter++) \ | 201 | iter++) \ |
230 | hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) | 202 | hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) |
231 | 203 | ||
232 | #define for_each_key_filter(b, k, iter, filter) \ | ||
233 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
234 | ((k) = bch_btree_iter_next_filter((iter), b, filter));) | ||
235 | |||
236 | #define for_each_key(b, k, iter) \ | ||
237 | for (bch_btree_iter_init((b), (iter), NULL); \ | ||
238 | ((k) = bch_btree_iter_next(iter));) | ||
239 | |||
240 | /* Recursing down the btree */ | 204 | /* Recursing down the btree */ |
241 | 205 | ||
242 | struct btree_op { | 206 | struct btree_op { |
207 | /* for waiting on btree reserve in btree_split() */ | ||
208 | wait_queue_t wait; | ||
209 | |||
243 | /* Btree level at which we start taking write locks */ | 210 | /* Btree level at which we start taking write locks */ |
244 | short lock; | 211 | short lock; |
245 | 212 | ||
@@ -249,6 +216,7 @@ struct btree_op { | |||
249 | static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) | 216 | static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) |
250 | { | 217 | { |
251 | memset(op, 0, sizeof(struct btree_op)); | 218 | memset(op, 0, sizeof(struct btree_op)); |
219 | init_wait(&op->wait); | ||
252 | op->lock = write_lock_level; | 220 | op->lock = write_lock_level; |
253 | } | 221 | } |
254 | 222 | ||
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b) | |||
267 | (w ? up_write : up_read)(&b->lock); | 235 | (w ? up_write : up_read)(&b->lock); |
268 | } | 236 | } |
269 | 237 | ||
270 | void bch_btree_node_read(struct btree *); | 238 | void bch_btree_node_read_done(struct btree *); |
271 | void bch_btree_node_write(struct btree *, struct closure *); | 239 | void bch_btree_node_write(struct btree *, struct closure *); |
272 | 240 | ||
273 | void bch_btree_set_root(struct btree *); | 241 | void bch_btree_set_root(struct btree *); |
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index dfff2410322e..7a228de95fd7 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c | |||
@@ -11,19 +11,6 @@ | |||
11 | 11 | ||
12 | #include "closure.h" | 12 | #include "closure.h" |
13 | 13 | ||
14 | #define CL_FIELD(type, field) \ | ||
15 | case TYPE_ ## type: \ | ||
16 | return &container_of(cl, struct type, cl)->field | ||
17 | |||
18 | static struct closure_waitlist *closure_waitlist(struct closure *cl) | ||
19 | { | ||
20 | switch (cl->type) { | ||
21 | CL_FIELD(closure_with_waitlist, wait); | ||
22 | default: | ||
23 | return NULL; | ||
24 | } | ||
25 | } | ||
26 | |||
27 | static inline void closure_put_after_sub(struct closure *cl, int flags) | 14 | static inline void closure_put_after_sub(struct closure *cl, int flags) |
28 | { | 15 | { |
29 | int r = flags & CLOSURE_REMAINING_MASK; | 16 | int r = flags & CLOSURE_REMAINING_MASK; |
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
42 | closure_queue(cl); | 29 | closure_queue(cl); |
43 | } else { | 30 | } else { |
44 | struct closure *parent = cl->parent; | 31 | struct closure *parent = cl->parent; |
45 | struct closure_waitlist *wait = closure_waitlist(cl); | ||
46 | closure_fn *destructor = cl->fn; | 32 | closure_fn *destructor = cl->fn; |
47 | 33 | ||
48 | closure_debug_destroy(cl); | 34 | closure_debug_destroy(cl); |
49 | 35 | ||
50 | smp_mb(); | ||
51 | atomic_set(&cl->remaining, -1); | ||
52 | |||
53 | if (wait) | ||
54 | closure_wake_up(wait); | ||
55 | |||
56 | if (destructor) | 36 | if (destructor) |
57 | destructor(cl); | 37 | destructor(cl); |
58 | 38 | ||
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v) | |||
69 | } | 49 | } |
70 | EXPORT_SYMBOL(closure_sub); | 50 | EXPORT_SYMBOL(closure_sub); |
71 | 51 | ||
52 | /** | ||
53 | * closure_put - decrement a closure's refcount | ||
54 | */ | ||
72 | void closure_put(struct closure *cl) | 55 | void closure_put(struct closure *cl) |
73 | { | 56 | { |
74 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); | 57 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); |
75 | } | 58 | } |
76 | EXPORT_SYMBOL(closure_put); | 59 | EXPORT_SYMBOL(closure_put); |
77 | 60 | ||
78 | static void set_waiting(struct closure *cl, unsigned long f) | 61 | /** |
79 | { | 62 | * closure_wake_up - wake up all closures on a wait list, without memory barrier |
80 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 63 | */ |
81 | cl->waiting_on = f; | ||
82 | #endif | ||
83 | } | ||
84 | |||
85 | void __closure_wake_up(struct closure_waitlist *wait_list) | 64 | void __closure_wake_up(struct closure_waitlist *wait_list) |
86 | { | 65 | { |
87 | struct llist_node *list; | 66 | struct llist_node *list; |
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list) | |||
106 | cl = container_of(reverse, struct closure, list); | 85 | cl = container_of(reverse, struct closure, list); |
107 | reverse = llist_next(reverse); | 86 | reverse = llist_next(reverse); |
108 | 87 | ||
109 | set_waiting(cl, 0); | 88 | closure_set_waiting(cl, 0); |
110 | closure_sub(cl, CLOSURE_WAITING + 1); | 89 | closure_sub(cl, CLOSURE_WAITING + 1); |
111 | } | 90 | } |
112 | } | 91 | } |
113 | EXPORT_SYMBOL(__closure_wake_up); | 92 | EXPORT_SYMBOL(__closure_wake_up); |
114 | 93 | ||
115 | bool closure_wait(struct closure_waitlist *list, struct closure *cl) | 94 | /** |
95 | * closure_wait - add a closure to a waitlist | ||
96 | * | ||
97 | * @waitlist will own a ref on @cl, which will be released when | ||
98 | * closure_wake_up() is called on @waitlist. | ||
99 | * | ||
100 | */ | ||
101 | bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) | ||
116 | { | 102 | { |
117 | if (atomic_read(&cl->remaining) & CLOSURE_WAITING) | 103 | if (atomic_read(&cl->remaining) & CLOSURE_WAITING) |
118 | return false; | 104 | return false; |
119 | 105 | ||
120 | set_waiting(cl, _RET_IP_); | 106 | closure_set_waiting(cl, _RET_IP_); |
121 | atomic_add(CLOSURE_WAITING + 1, &cl->remaining); | 107 | atomic_add(CLOSURE_WAITING + 1, &cl->remaining); |
122 | llist_add(&cl->list, &list->list); | 108 | llist_add(&cl->list, &waitlist->list); |
123 | 109 | ||
124 | return true; | 110 | return true; |
125 | } | 111 | } |
126 | EXPORT_SYMBOL(closure_wait); | 112 | EXPORT_SYMBOL(closure_wait); |
127 | 113 | ||
128 | /** | 114 | /** |
129 | * closure_sync() - sleep until a closure has nothing left to wait on | 115 | * closure_sync - sleep until a closure has nothing left to wait on |
130 | * | 116 | * |
131 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns | 117 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns |
132 | * the last refcount. | 118 | * the last refcount. |
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl) | |||
148 | } | 134 | } |
149 | EXPORT_SYMBOL(closure_sync); | 135 | EXPORT_SYMBOL(closure_sync); |
150 | 136 | ||
151 | /** | ||
152 | * closure_trylock() - try to acquire the closure, without waiting | ||
153 | * @cl: closure to lock | ||
154 | * | ||
155 | * Returns true if the closure was successfully locked. | ||
156 | */ | ||
157 | bool closure_trylock(struct closure *cl, struct closure *parent) | ||
158 | { | ||
159 | if (atomic_cmpxchg(&cl->remaining, -1, | ||
160 | CLOSURE_REMAINING_INITIALIZER) != -1) | ||
161 | return false; | ||
162 | |||
163 | smp_mb(); | ||
164 | |||
165 | cl->parent = parent; | ||
166 | if (parent) | ||
167 | closure_get(parent); | ||
168 | |||
169 | closure_set_ret_ip(cl); | ||
170 | closure_debug_create(cl); | ||
171 | return true; | ||
172 | } | ||
173 | EXPORT_SYMBOL(closure_trylock); | ||
174 | |||
175 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
176 | struct closure_waitlist *wait_list) | ||
177 | { | ||
178 | struct closure wait; | ||
179 | closure_init_stack(&wait); | ||
180 | |||
181 | while (1) { | ||
182 | if (closure_trylock(cl, parent)) | ||
183 | return; | ||
184 | |||
185 | closure_wait_event(wait_list, &wait, | ||
186 | atomic_read(&cl->remaining) == -1); | ||
187 | } | ||
188 | } | ||
189 | EXPORT_SYMBOL(__closure_lock); | ||
190 | |||
191 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 137 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
192 | 138 | ||
193 | static LIST_HEAD(closure_list); | 139 | static LIST_HEAD(closure_list); |
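With closure_trylock()/__closure_lock() gone (their one user in btree.c now takes a plain semaphore), what remains of the waitlist API is the wait/wake pair documented above: closure_wait() parks a ref-holding closure on a list, closure_wake_up() releases every parked closure. A sketch of the dedicated-event pattern under that API; everything other than the closure_*() calls is illustrative, and real callers re-check their condition after parking to avoid missing a wakeup:

static struct closure_waitlist thing_wait;

static void thing_done(struct closure *cl)
{
	/* Runs (on system_wq) after closure_wake_up(&thing_wait). */
	closure_return(cl);
}

static void wait_for_thing(struct closure *cl)
{
	closure_wait(&thing_wait, cl);	/* thing_wait now owns a ref on cl */
	continue_at(cl, thing_done, system_wq);
}

/* Completion path: drops the refs closure_wait() took, scheduling thing_done. */
static void thing_complete(void)
{
	closure_wake_up(&thing_wait);
}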
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 9762f1be3304..7ef7461912be 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h | |||
@@ -72,30 +72,6 @@ | |||
72 | * closure - _always_ use continue_at(). Doing so consistently will help | 72 | * closure - _always_ use continue_at(). Doing so consistently will help |
73 | * eliminate an entire class of particularly pernicious races. | 73 | * eliminate an entire class of particularly pernicious races. |
74 | * | 74 | * |
75 | * For a closure to wait on an arbitrary event, we need to introduce waitlists: | ||
76 | * | ||
77 | * struct closure_waitlist list; | ||
78 | * closure_wait_event(list, cl, condition); | ||
79 | * closure_wake_up(wait_list); | ||
80 | * | ||
81 | * These work analogously to wait_event() and wake_up() - except that instead of | ||
82 | * operating on the current thread (for wait_event()) and lists of threads, they | ||
83 | * operate on an explicit closure and lists of closures. | ||
84 | * | ||
85 | * Because it's a closure we can now wait either synchronously or | ||
86 | * asynchronously. closure_wait_event() returns the current value of the | ||
87 | * condition, and if it returned false continue_at() or closure_sync() can be | ||
88 | * used to wait for it to become true. | ||
89 | * | ||
90 | * It's useful for waiting on things when you can't sleep in the context in | ||
91 | * which you must check the condition (perhaps with a spinlock held, or you might be | ||
92 | * beneath generic_make_request() - in which case you can't sleep on IO). | ||
93 | * | ||
94 | * closure_wait_event() will wait either synchronously or asynchronously, | ||
95 | * depending on whether the closure is in blocking mode or not. You can pick a | ||
96 | * mode explicitly with closure_wait_event_sync() and | ||
97 | * closure_wait_event_async(), which do just what you might expect. | ||
98 | * | ||
99 | * Lastly, you might have a wait list dedicated to a specific event, and have no | 75 | * Lastly, you might have a wait list dedicated to a specific event, and have no |
100 | * need for specifying the condition - you just want to wait until someone runs | 76 | * need for specifying the condition - you just want to wait until someone runs |
101 | * closure_wake_up() on the appropriate wait list. In that case, just use | 77 | * closure_wake_up() on the appropriate wait list. In that case, just use |
@@ -121,40 +97,6 @@ | |||
121 | * All this implies that a closure should typically be embedded in a particular | 97 | * All this implies that a closure should typically be embedded in a particular |
122 | * struct (which its refcount will normally control the lifetime of), and that | 98 | * struct (which its refcount will normally control the lifetime of), and that |
123 | * struct can very much be thought of as a stack frame. | 99 | * struct can very much be thought of as a stack frame. |
124 | * | ||
125 | * Locking: | ||
126 | * | ||
127 | * Closures are based on work items but they can be thought of as more like | ||
128 | * threads - in that like threads and unlike work items they have a well | ||
129 | * defined lifetime; they are created (with closure_init()) and eventually | ||
130 | * complete after a continue_at(cl, NULL, NULL). | ||
131 | * | ||
132 | * Suppose you've got some larger structure with a closure embedded in it that's | ||
133 | * used for periodically doing garbage collection. You only want one garbage | ||
134 | * collection happening at a time, so the natural thing to do is protect it with | ||
135 | * a lock. However, it's difficult to use a lock protecting a closure correctly | ||
136 | * because the unlock should come after the last continue_at() (additionally, if | ||
137 | * you're using the closure asynchronously a mutex won't work since a mutex has | ||
138 | * to be unlocked by the same process that locked it). | ||
139 | * | ||
140 | * So to make it less error prone and more efficient, we also have the ability | ||
141 | * to use closures as locks: | ||
142 | * | ||
143 | * closure_init_unlocked(); | ||
144 | * closure_trylock(); | ||
145 | * | ||
146 | * That's all we need for trylock() - the last closure_put() implicitly unlocks | ||
147 | * it for you. But for closure_lock(), we also need a wait list: | ||
148 | * | ||
149 | * struct closure_with_waitlist frobnicator_cl; | ||
150 | * | ||
151 | * closure_init_unlocked(&frobnicator_cl); | ||
152 | * closure_lock(&frobnicator_cl); | ||
153 | * | ||
154 | * A closure_with_waitlist embeds a closure and a wait list - much like struct | ||
155 | * delayed_work embeds a work item and a timer_list. The important thing is, use | ||
156 | * it exactly like you would a regular closure and closure_put() will magically | ||
157 | * handle everything for you. | ||
158 | */ | 100 | */ |
159 | 101 | ||
160 | struct closure; | 102 | struct closure; |
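The wait-list comments deleted above describe a wait_event()-like pattern built on closure_wait()/closure_wake_up(); the removed closure_wait_event() macro open-codes roughly as follows. This is a sketch for illustration only - example_wait_event() is not part of the patch:

    #include <linux/sched.h>
    #include "closure.h"

    static bool example_wait_event(struct closure_waitlist *list,
                                   struct closure *cl, bool (*cond)(void))
    {
            while (1) {
                    if (cond()) {
                            /* make sure our wait ref is dropped before returning */
                            __closure_wake_up(list);
                            closure_sync(cl);
                            return true;
                    }

                    __closure_start_sleep(cl);

                    /*
                     * closure_wait() returns false if cl was already on the
                     * list; only then do we actually sleep - otherwise loop
                     * and recheck the condition to close the race with the
                     * waker's closure_wake_up().
                     */
                    if (!closure_wait(list, cl))
                            schedule();
            }
    }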
@@ -164,12 +106,6 @@ struct closure_waitlist { | |||
164 | struct llist_head list; | 106 | struct llist_head list; |
165 | }; | 107 | }; |
166 | 108 | ||
167 | enum closure_type { | ||
168 | TYPE_closure = 0, | ||
169 | TYPE_closure_with_waitlist = 1, | ||
170 | MAX_CLOSURE_TYPE = 1, | ||
171 | }; | ||
172 | |||
173 | enum closure_state { | 109 | enum closure_state { |
174 | /* | 110 | /* |
175 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by | 111 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by |
@@ -224,8 +160,6 @@ struct closure { | |||
224 | 160 | ||
225 | atomic_t remaining; | 161 | atomic_t remaining; |
226 | 162 | ||
227 | enum closure_type type; | ||
228 | |||
229 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 163 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
230 | #define CLOSURE_MAGIC_DEAD 0xc054dead | 164 | #define CLOSURE_MAGIC_DEAD 0xc054dead |
231 | #define CLOSURE_MAGIC_ALIVE 0xc054a11e | 165 | #define CLOSURE_MAGIC_ALIVE 0xc054a11e |
@@ -237,34 +171,12 @@ struct closure { | |||
237 | #endif | 171 | #endif |
238 | }; | 172 | }; |
239 | 173 | ||
240 | struct closure_with_waitlist { | ||
241 | struct closure cl; | ||
242 | struct closure_waitlist wait; | ||
243 | }; | ||
244 | |||
245 | extern unsigned invalid_closure_type(void); | ||
246 | |||
247 | #define __CLOSURE_TYPE(cl, _t) \ | ||
248 | __builtin_types_compatible_p(typeof(cl), struct _t) \ | ||
249 | ? TYPE_ ## _t : \ | ||
250 | |||
251 | #define __closure_type(cl) \ | ||
252 | ( \ | ||
253 | __CLOSURE_TYPE(cl, closure) \ | ||
254 | __CLOSURE_TYPE(cl, closure_with_waitlist) \ | ||
255 | invalid_closure_type() \ | ||
256 | ) | ||
257 | |||
258 | void closure_sub(struct closure *cl, int v); | 174 | void closure_sub(struct closure *cl, int v); |
259 | void closure_put(struct closure *cl); | 175 | void closure_put(struct closure *cl); |
260 | void __closure_wake_up(struct closure_waitlist *list); | 176 | void __closure_wake_up(struct closure_waitlist *list); |
261 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); | 177 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); |
262 | void closure_sync(struct closure *cl); | 178 | void closure_sync(struct closure *cl); |
263 | 179 | ||
264 | bool closure_trylock(struct closure *cl, struct closure *parent); | ||
265 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
266 | struct closure_waitlist *wait_list); | ||
267 | |||
268 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 180 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
269 | 181 | ||
270 | void closure_debug_init(void); | 182 | void closure_debug_init(void); |
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl) | |||
293 | #endif | 205 | #endif |
294 | } | 206 | } |
295 | 207 | ||
296 | static inline void closure_get(struct closure *cl) | 208 | static inline void closure_set_waiting(struct closure *cl, unsigned long f) |
297 | { | 209 | { |
298 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 210 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
299 | BUG_ON((atomic_inc_return(&cl->remaining) & | 211 | cl->waiting_on = f; |
300 | CLOSURE_REMAINING_MASK) <= 1); | ||
301 | #else | ||
302 | atomic_inc(&cl->remaining); | ||
303 | #endif | 212 | #endif |
304 | } | 213 | } |
305 | 214 | ||
306 | static inline void closure_set_stopped(struct closure *cl) | 215 | static inline void __closure_end_sleep(struct closure *cl) |
307 | { | 216 | { |
308 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); | 217 | __set_current_state(TASK_RUNNING); |
218 | |||
219 | if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | ||
220 | atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | ||
309 | } | 221 | } |
310 | 222 | ||
311 | static inline bool closure_is_unlocked(struct closure *cl) | 223 | static inline void __closure_start_sleep(struct closure *cl) |
312 | { | 224 | { |
313 | return atomic_read(&cl->remaining) == -1; | 225 | closure_set_ip(cl); |
226 | cl->task = current; | ||
227 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
228 | |||
229 | if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||
230 | atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||
314 | } | 231 | } |
315 | 232 | ||
316 | static inline void do_closure_init(struct closure *cl, struct closure *parent, | 233 | static inline void closure_set_stopped(struct closure *cl) |
317 | bool running) | ||
318 | { | 234 | { |
319 | cl->parent = parent; | 235 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); |
320 | if (parent) | 236 | } |
321 | closure_get(parent); | ||
322 | |||
323 | if (running) { | ||
324 | closure_debug_create(cl); | ||
325 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||
326 | } else | ||
327 | atomic_set(&cl->remaining, -1); | ||
328 | 237 | ||
238 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||
239 | struct workqueue_struct *wq) | ||
240 | { | ||
241 | BUG_ON(object_is_on_stack(cl)); | ||
329 | closure_set_ip(cl); | 242 | closure_set_ip(cl); |
243 | cl->fn = fn; | ||
244 | cl->wq = wq; | ||
245 | /* between atomic_dec() in closure_put() */ | ||
246 | smp_mb__before_atomic_dec(); | ||
330 | } | 247 | } |
331 | 248 | ||
332 | /* | 249 | static inline void closure_queue(struct closure *cl) |
333 | * Hack to get at the embedded closure if there is one, by doing an unsafe cast: | 250 | { |
334 | * the result of __closure_type() is thrown away, it's used merely for type | 251 | struct workqueue_struct *wq = cl->wq; |
335 | * checking. | 252 | if (wq) { |
336 | */ | 253 | INIT_WORK(&cl->work, cl->work.func); |
337 | #define __to_internal_closure(cl) \ | 254 | BUG_ON(!queue_work(wq, &cl->work)); |
338 | ({ \ | 255 | } else |
339 | BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ | 256 | cl->fn(cl); |
340 | (struct closure *) cl; \ | 257 | } |
341 | }) | ||
342 | |||
343 | #define closure_init_type(cl, parent, running) \ | ||
344 | do { \ | ||
345 | struct closure *_cl = __to_internal_closure(cl); \ | ||
346 | _cl->type = __closure_type(*(cl)); \ | ||
347 | do_closure_init(_cl, parent, running); \ | ||
348 | } while (0) | ||
349 | 258 | ||
350 | /** | 259 | /** |
351 | * __closure_init() - Initialize a closure, skipping the memset() | 260 | * closure_get - increment a closure's refcount |
352 | * | ||
353 | * May be used instead of closure_init() when memory has already been zeroed. | ||
354 | */ | 261 | */ |
355 | #define __closure_init(cl, parent) \ | 262 | static inline void closure_get(struct closure *cl) |
356 | closure_init_type(cl, parent, true) | 263 | { |
264 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
265 | BUG_ON((atomic_inc_return(&cl->remaining) & | ||
266 | CLOSURE_REMAINING_MASK) <= 1); | ||
267 | #else | ||
268 | atomic_inc(&cl->remaining); | ||
269 | #endif | ||
270 | } | ||
357 | 271 | ||
358 | /** | 272 | /** |
359 | * closure_init() - Initialize a closure, setting the refcount to 1 | 273 | * closure_init - Initialize a closure, setting the refcount to 1 |
360 | * @cl: closure to initialize | 274 | * @cl: closure to initialize |
361 | * @parent: parent of the new closure. cl will take a refcount on it for its | 275 | * @parent: parent of the new closure. cl will take a refcount on it for its |
362 | * lifetime; may be NULL. | 276 | * lifetime; may be NULL. |
363 | */ | 277 | */ |
364 | #define closure_init(cl, parent) \ | 278 | static inline void closure_init(struct closure *cl, struct closure *parent) |
365 | do { \ | ||
366 | memset((cl), 0, sizeof(*(cl))); \ | ||
367 | __closure_init(cl, parent); \ | ||
368 | } while (0) | ||
369 | |||
370 | static inline void closure_init_stack(struct closure *cl) | ||
371 | { | 279 | { |
372 | memset(cl, 0, sizeof(struct closure)); | 280 | memset(cl, 0, sizeof(struct closure)); |
373 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); | 281 | cl->parent = parent; |
374 | } | 282 | if (parent) |
375 | 283 | closure_get(parent); | |
376 | /** | ||
377 | * closure_init_unlocked() - Initialize a closure but leave it unlocked. | ||
378 | * @cl: closure to initialize | ||
379 | * | ||
380 | * For when the closure will be used as a lock. The closure may not be used | ||
381 | * until after a closure_lock() or closure_trylock(). | ||
382 | */ | ||
383 | #define closure_init_unlocked(cl) \ | ||
384 | do { \ | ||
385 | memset((cl), 0, sizeof(*(cl))); \ | ||
386 | closure_init_type(cl, NULL, false); \ | ||
387 | } while (0) | ||
388 | |||
389 | /** | ||
390 | * closure_lock() - lock and initialize a closure. | ||
391 | * @cl: the closure to lock | ||
392 | * @parent: the new parent for this closure | ||
393 | * | ||
394 | * The closure must be of one of the types that has a waitlist (otherwise we | ||
395 | * wouldn't be able to sleep on contention). | ||
396 | * | ||
397 | * @parent has exactly the same meaning as in closure_init(); if non null, the | ||
398 | * closure will take a reference on @parent which will be released when it is | ||
399 | * unlocked. | ||
400 | */ | ||
401 | #define closure_lock(cl, parent) \ | ||
402 | __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) | ||
403 | 284 | ||
404 | static inline void __closure_end_sleep(struct closure *cl) | 285 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); |
405 | { | ||
406 | __set_current_state(TASK_RUNNING); | ||
407 | 286 | ||
408 | if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | 287 | closure_debug_create(cl); |
409 | atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | 288 | closure_set_ip(cl); |
410 | } | 289 | } |
411 | 290 | ||
412 | static inline void __closure_start_sleep(struct closure *cl) | 291 | static inline void closure_init_stack(struct closure *cl) |
413 | { | 292 | { |
414 | closure_set_ip(cl); | 293 | memset(cl, 0, sizeof(struct closure)); |
415 | cl->task = current; | 294 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); |
416 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
417 | |||
418 | if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||
419 | atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||
420 | } | 295 | } |
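To illustrate the embedding pattern closure_init() is documented for - a closure controlling the lifetime of its containing struct, optionally pinning a parent - a hedged sketch with hypothetical example_* names:

    #include "closure.h"

    struct example_op {
            struct closure  cl;     /* controls the lifetime of example_op */
            /* ... per-operation state ... */
    };

    static void example_op_init(struct example_op *op, struct closure *parent)
    {
            closure_init(&op->cl, parent);  /* refcount 1; parent, if any, is pinned */
    }

    /* Each outstanding async piece of the operation pins op->cl: */
    static void example_sub_request_start(struct example_op *op)
    {
            closure_get(&op->cl);
    }

    static void example_sub_request_done(struct example_op *op)
    {
            closure_put(&op->cl);   /* the final put runs whatever continue_at() set */
    }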
421 | 296 | ||
422 | /** | 297 | /** |
423 | * closure_wake_up() - wake up all closures on a wait list. | 298 | * closure_wake_up - wake up all closures on a wait list. |
424 | */ | 299 | */ |
425 | static inline void closure_wake_up(struct closure_waitlist *list) | 300 | static inline void closure_wake_up(struct closure_waitlist *list) |
426 | { | 301 | { |
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list) | |||
428 | __closure_wake_up(list); | 303 | __closure_wake_up(list); |
429 | } | 304 | } |
430 | 305 | ||
431 | /* | 306 | /** |
432 | * Wait on an event, synchronously or asynchronously - analogous to wait_event() | 307 | * continue_at - jump to another function with barrier |
433 | * but for closures. | 308 | * |
434 | * | 309 | * After @cl is no longer waiting on anything (i.e. all outstanding refs have |
435 | * The loop is oddly structured so as to avoid a race; we must check the | 310 | * been dropped with closure_put()), it will resume execution at @fn running out |
436 | * condition again after we've added ourself to the waitlist. We know if we were | 311 | * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). |
437 | * already on the waitlist because closure_wait() returns false; thus, we only | 312 | * |
438 | * schedule or break if closure_wait() returns false. If it returns true, we | 313 | * NOTE: This macro expands to a return in the calling function! |
439 | * just loop again - rechecking the condition. | 314 | * |
440 | * | 315 | * This is because after calling continue_at() you no longer have a ref on @cl, |
441 | * The __closure_wake_up() is necessary because we may race with the event | 316 | * and whatever @cl owns may be freed out from under you - a running closure fn |
442 | * becoming true; i.e. we see event false -> wait -> recheck condition, but the | 317 | * has a ref on its own closure which continue_at() drops. |
443 | * thread that made the event true may have called closure_wake_up() before we | ||
444 | * added ourself to the wait list. | ||
445 | * | ||
446 | * We have to call closure_sync() at the end instead of just | ||
447 | * __closure_end_sleep() because a different thread might've called | ||
448 | * closure_wake_up() before us and gotten preempted before they dropped the | ||
449 | * refcount on our closure. If this was a stack allocated closure, that would be | ||
450 | * bad. | ||
451 | */ | 318 | */ |
452 | #define closure_wait_event(list, cl, condition) \ | ||
453 | ({ \ | ||
454 | typeof(condition) ret; \ | ||
455 | \ | ||
456 | while (1) { \ | ||
457 | ret = (condition); \ | ||
458 | if (ret) { \ | ||
459 | __closure_wake_up(list); \ | ||
460 | closure_sync(cl); \ | ||
461 | break; \ | ||
462 | } \ | ||
463 | \ | ||
464 | __closure_start_sleep(cl); \ | ||
465 | \ | ||
466 | if (!closure_wait(list, cl)) \ | ||
467 | schedule(); \ | ||
468 | } \ | ||
469 | \ | ||
470 | ret; \ | ||
471 | }) | ||
472 | |||
473 | static inline void closure_queue(struct closure *cl) | ||
474 | { | ||
475 | struct workqueue_struct *wq = cl->wq; | ||
476 | if (wq) { | ||
477 | INIT_WORK(&cl->work, cl->work.func); | ||
478 | BUG_ON(!queue_work(wq, &cl->work)); | ||
479 | } else | ||
480 | cl->fn(cl); | ||
481 | } | ||
482 | |||
483 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||
484 | struct workqueue_struct *wq) | ||
485 | { | ||
486 | BUG_ON(object_is_on_stack(cl)); | ||
487 | closure_set_ip(cl); | ||
488 | cl->fn = fn; | ||
489 | cl->wq = wq; | ||
490 | /* between atomic_dec() in closure_put() */ | ||
491 | smp_mb__before_atomic_dec(); | ||
492 | } | ||
493 | |||
494 | #define continue_at(_cl, _fn, _wq) \ | 319 | #define continue_at(_cl, _fn, _wq) \ |
495 | do { \ | 320 | do { \ |
496 | set_closure_fn(_cl, _fn, _wq); \ | 321 | set_closure_fn(_cl, _fn, _wq); \ |
@@ -498,8 +323,28 @@ do { \ | |||
498 | return; \ | 323 | return; \ |
499 | } while (0) | 324 | } while (0) |
500 | 325 | ||
326 | /** | ||
327 | * closure_return - finish execution of a closure | ||
328 | * | ||
329 | * This is used to indicate that @cl is finished: when all outstanding refs on | ||
330 | * @cl have been dropped @cl's ref on its parent closure (as passed to | ||
331 | * closure_init()) will be dropped, if one was specified - thus this can be | ||
332 | * thought of as returning to the parent closure. | ||
333 | */ | ||
501 | #define closure_return(_cl) continue_at((_cl), NULL, NULL) | 334 | #define closure_return(_cl) continue_at((_cl), NULL, NULL) |
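A compact sketch of the continue_at()/closure_return() flow described above - all example_* names are illustrative, not from bcache:

    #include <linux/kernel.h>
    #include <linux/workqueue.h>
    #include "closure.h"

    struct example_op {
            struct closure          cl;
            struct workqueue_struct *wq;
    };

    static void example_op_finish(struct closure *cl)
    {
            /* runs out of op->wq once every ref taken on cl has been dropped */
            closure_return(cl);     /* also drops cl's ref on its parent, if any */
    }

    static void example_op_start(struct closure *cl)
    {
            struct example_op *op = container_of(cl, struct example_op, cl);

            /* ... submit async work; each piece holds a ref on cl ... */

            /* NOTE: continue_at() returns from example_op_start() */
            continue_at(cl, example_op_finish, op->wq);
    }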
502 | 335 | ||
336 | /** | ||
337 | * continue_at_nobarrier - jump to another function without barrier | ||
338 | * | ||
339 | * Causes @fn to be executed out of @cl, in @wq context (or called directly if | ||
340 | * @wq is NULL). | ||
341 | * | ||
342 | * NOTE: like continue_at(), this macro expands to a return in the caller! | ||
343 | * | ||
344 | * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, | ||
345 | * thus it's not safe to touch anything protected by @cl after a | ||
346 | * continue_at_nobarrier(). | ||
347 | */ | ||
503 | #define continue_at_nobarrier(_cl, _fn, _wq) \ | 348 | #define continue_at_nobarrier(_cl, _fn, _wq) \ |
504 | do { \ | 349 | do { \ |
505 | set_closure_fn(_cl, _fn, _wq); \ | 350 | set_closure_fn(_cl, _fn, _wq); \ |
@@ -507,6 +352,15 @@ do { \ | |||
507 | return; \ | 352 | return; \ |
508 | } while (0) | 353 | } while (0) |
509 | 354 | ||
355 | /** | ||
356 | * closure_return_with_destructor - finish execution of a closure, with destructor | ||
357 | * | ||
358 | * Works like closure_return(), except @destructor will be called when all | ||
359 | * outstanding refs on @cl have been dropped; @destructor may be used to safely | ||
360 | * free the memory occupied by @cl, and it is called with the ref on the parent | ||
361 | * closure still held - so @destructor could safely return an item to a | ||
362 | * freelist protected by @cl's parent. | ||
363 | */ | ||
510 | #define closure_return_with_destructor(_cl, _destructor) \ | 364 | #define closure_return_with_destructor(_cl, _destructor) \ |
511 | do { \ | 365 | do { \ |
512 | set_closure_fn(_cl, _destructor, NULL); \ | 366 | set_closure_fn(_cl, _destructor, NULL); \ |
@@ -514,6 +368,13 @@ do { \ | |||
514 | return; \ | 368 | return; \ |
515 | } while (0) | 369 | } while (0) |
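For the destructor variant just documented, a minimal sketch of freeing a heap-allocated closure safely (example_* names are hypothetical):

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include "closure.h"

    struct example_op {
            struct closure cl;
    };

    static void example_op_free(struct closure *cl)
    {
            /* runs only after the last ref is gone; the parent ref is still held */
            kfree(container_of(cl, struct example_op, cl));
    }

    static void example_op_done(struct closure *cl)
    {
            /* like closure_return(), this expands to a return from the caller */
            closure_return_with_destructor(cl, example_op_free);
    }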
516 | 370 | ||
371 | /** | ||
372 | * closure_call - execute @fn out of a new, uninitialized closure | ||
373 | * | ||
374 | * Typically used when running out of one closure, and we want to run @fn | ||
375 | * asynchronously out of a new closure - @parent will then wait for @cl to | ||
376 | * finish. | ||
377 | */ | ||
517 | static inline void closure_call(struct closure *cl, closure_fn fn, | 378 | static inline void closure_call(struct closure *cl, closure_fn fn, |
518 | struct workqueue_struct *wq, | 379 | struct workqueue_struct *wq, |
519 | struct closure *parent) | 380 | struct closure *parent) |
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn, | |||
522 | continue_at_nobarrier(cl, fn, wq); | 383 | continue_at_nobarrier(cl, fn, wq); |
523 | } | 384 | } |
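And a sketch of closure_call() as documented above, spawning work out of a fresh closure whose completion the parent waits for (example_* names and the use of system_wq are assumptions):

    #include <linux/workqueue.h>
    #include "closure.h"

    struct example_job {
            struct closure cl;      /* must not live on the stack: set_closure_fn() checks */
    };

    static void example_async_fn(struct closure *cl)
    {
            /* ... do the work, possibly continue_at() into further stages ... */
            closure_return(cl);     /* releases the ref held on the parent */
    }

    static void example_spawn(struct example_job *job, struct closure *parent)
    {
            /* job->cl needs no prior initialization; parent now waits on it */
            closure_call(&job->cl, example_async_fn, system_wq, parent);
    }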
524 | 385 | ||
525 | static inline void closure_trylock_call(struct closure *cl, closure_fn fn, | ||
526 | struct workqueue_struct *wq, | ||
527 | struct closure *parent) | ||
528 | { | ||
529 | if (closure_trylock(cl, parent)) | ||
530 | continue_at_nobarrier(cl, fn, wq); | ||
531 | } | ||
532 | |||
533 | #endif /* _LINUX_CLOSURE_H */ | 386 | #endif /* _LINUX_CLOSURE_H */ |
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 264fcfbd6290..8b1f1d5c1819 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "btree.h" | 9 | #include "btree.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | #include "extents.h" | ||
11 | 12 | ||
12 | #include <linux/console.h> | 13 | #include <linux/console.h> |
13 | #include <linux/debugfs.h> | 14 | #include <linux/debugfs.h> |
@@ -17,163 +18,96 @@ | |||
17 | 18 | ||
18 | static struct dentry *debug; | 19 | static struct dentry *debug; |
19 | 20 | ||
20 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||
21 | { | ||
22 | unsigned i; | ||
23 | |||
24 | for (i = 0; i < KEY_PTRS(k); i++) | ||
25 | if (ptr_available(c, k, i)) { | ||
26 | struct cache *ca = PTR_CACHE(c, k, i); | ||
27 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
28 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
29 | |||
30 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||
31 | return "bad, length too big"; | ||
32 | if (bucket < ca->sb.first_bucket) | ||
33 | return "bad, short offset"; | ||
34 | if (bucket >= ca->sb.nbuckets) | ||
35 | return "bad, offset past end of device"; | ||
36 | if (ptr_stale(c, k, i)) | ||
37 | return "stale"; | ||
38 | } | ||
39 | |||
40 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
41 | return "bad, null key"; | ||
42 | if (!KEY_PTRS(k)) | ||
43 | return "bad, no pointers"; | ||
44 | if (!KEY_SIZE(k)) | ||
45 | return "zeroed key"; | ||
46 | return ""; | ||
47 | } | ||
48 | |||
49 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) | ||
50 | { | ||
51 | unsigned i = 0; | ||
52 | char *out = buf, *end = buf + size; | ||
53 | |||
54 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | ||
55 | |||
56 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | ||
57 | |||
58 | if (KEY_PTRS(k)) | ||
59 | while (1) { | ||
60 | p("%llu:%llu gen %llu", | ||
61 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||
62 | |||
63 | if (++i == KEY_PTRS(k)) | ||
64 | break; | ||
65 | |||
66 | p(", "); | ||
67 | } | ||
68 | |||
69 | p("]"); | ||
70 | |||
71 | if (KEY_DIRTY(k)) | ||
72 | p(" dirty"); | ||
73 | if (KEY_CSUM(k)) | ||
74 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||
75 | #undef p | ||
76 | return out - buf; | ||
77 | } | ||
78 | |||
79 | #ifdef CONFIG_BCACHE_DEBUG | 21 | #ifdef CONFIG_BCACHE_DEBUG |
80 | 22 | ||
81 | static void dump_bset(struct btree *b, struct bset *i) | 23 | #define for_each_written_bset(b, start, i) \ |
82 | { | 24 | for (i = (start); \ |
83 | struct bkey *k, *next; | 25 | (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ |
84 | unsigned j; | 26 | i->seq == (start)->seq; \ |
85 | char buf[80]; | 27 | i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ |
86 | 28 | block_bytes(b->c)) | |
87 | for (k = i->start; k < end(i); k = next) { | ||
88 | next = bkey_next(k); | ||
89 | |||
90 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
91 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | ||
92 | (uint64_t *) k - i->d, i->keys, buf); | ||
93 | |||
94 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
95 | size_t n = PTR_BUCKET_NR(b->c, k, j); | ||
96 | printk(" bucket %zu", n); | ||
97 | |||
98 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||
99 | printk(" prio %i", | ||
100 | PTR_BUCKET(b->c, k, j)->prio); | ||
101 | } | ||
102 | 29 | ||
103 | printk(" %s\n", bch_ptr_status(b->c, k)); | 30 | void bch_btree_verify(struct btree *b) |
104 | |||
105 | if (next < end(i) && | ||
106 | bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) | ||
107 | printk(KERN_ERR "Key skipped backwards\n"); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | static void bch_dump_bucket(struct btree *b) | ||
112 | { | ||
113 | unsigned i; | ||
114 | |||
115 | console_lock(); | ||
116 | for (i = 0; i <= b->nsets; i++) | ||
117 | dump_bset(b, b->sets[i].data); | ||
118 | console_unlock(); | ||
119 | } | ||
120 | |||
121 | void bch_btree_verify(struct btree *b, struct bset *new) | ||
122 | { | 31 | { |
123 | struct btree *v = b->c->verify_data; | 32 | struct btree *v = b->c->verify_data; |
124 | struct closure cl; | 33 | struct bset *ondisk, *sorted, *inmemory; |
125 | closure_init_stack(&cl); | 34 | struct bio *bio; |
126 | 35 | ||
127 | if (!b->c->verify) | 36 | if (!b->c->verify || !b->c->verify_ondisk) |
128 | return; | 37 | return; |
129 | 38 | ||
130 | closure_wait_event(&b->io.wait, &cl, | 39 | down(&b->io_mutex); |
131 | atomic_read(&b->io.cl.remaining) == -1); | ||
132 | |||
133 | mutex_lock(&b->c->verify_lock); | 40 | mutex_lock(&b->c->verify_lock); |
134 | 41 | ||
42 | ondisk = b->c->verify_ondisk; | ||
43 | sorted = b->c->verify_data->keys.set->data; | ||
44 | inmemory = b->keys.set->data; | ||
45 | |||
135 | bkey_copy(&v->key, &b->key); | 46 | bkey_copy(&v->key, &b->key); |
136 | v->written = 0; | 47 | v->written = 0; |
137 | v->level = b->level; | 48 | v->level = b->level; |
49 | v->keys.ops = b->keys.ops; | ||
50 | |||
51 | bio = bch_bbio_alloc(b->c); | ||
52 | bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; | ||
53 | bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); | ||
54 | bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; | ||
55 | bch_bio_map(bio, sorted); | ||
138 | 56 | ||
139 | bch_btree_node_read(v); | 57 | submit_bio_wait(REQ_META|READ_SYNC, bio); |
140 | closure_wait_event(&v->io.wait, &cl, | 58 | bch_bbio_free(bio, b->c); |
141 | atomic_read(&b->io.cl.remaining) == -1); | ||
142 | 59 | ||
143 | if (new->keys != v->sets[0].data->keys || | 60 | memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); |
144 | memcmp(new->start, | 61 | |
145 | v->sets[0].data->start, | 62 | bch_btree_node_read_done(v); |
146 | (void *) end(new) - (void *) new->start)) { | 63 | sorted = v->keys.set->data; |
147 | unsigned i, j; | 64 | |
65 | if (inmemory->keys != sorted->keys || | ||
66 | memcmp(inmemory->start, | ||
67 | sorted->start, | ||
68 | (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { | ||
69 | struct bset *i; | ||
70 | unsigned j; | ||
148 | 71 | ||
149 | console_lock(); | 72 | console_lock(); |
150 | 73 | ||
151 | printk(KERN_ERR "*** original memory node:\n"); | 74 | printk(KERN_ERR "*** in memory:\n"); |
152 | for (i = 0; i <= b->nsets; i++) | 75 | bch_dump_bset(&b->keys, inmemory, 0); |
153 | dump_bset(b, b->sets[i].data); | ||
154 | 76 | ||
155 | printk(KERN_ERR "*** sorted memory node:\n"); | 77 | printk(KERN_ERR "*** read back in:\n"); |
156 | dump_bset(b, new); | 78 | bch_dump_bset(&v->keys, sorted, 0); |
157 | 79 | ||
158 | printk(KERN_ERR "*** on disk node:\n"); | 80 | for_each_written_bset(b, ondisk, i) { |
159 | dump_bset(v, v->sets[0].data); | 81 | unsigned block = ((void *) i - (void *) ondisk) / |
82 | block_bytes(b->c); | ||
83 | |||
84 | printk(KERN_ERR "*** on disk block %u:\n", block); | ||
85 | bch_dump_bset(&b->keys, i, block); | ||
86 | } | ||
160 | 87 | ||
161 | for (j = 0; j < new->keys; j++) | 88 | printk(KERN_ERR "*** block %zu not written\n", |
162 | if (new->d[j] != v->sets[0].data->d[j]) | 89 | ((void *) i - (void *) ondisk) / block_bytes(b->c)); |
90 | |||
91 | for (j = 0; j < inmemory->keys; j++) | ||
92 | if (inmemory->d[j] != sorted->d[j]) | ||
163 | break; | 93 | break; |
164 | 94 | ||
95 | printk(KERN_ERR "b->written %u\n", b->written); | ||
96 | |||
165 | console_unlock(); | 97 | console_unlock(); |
166 | panic("verify failed at %u\n", j); | 98 | panic("verify failed at %u\n", j); |
167 | } | 99 | } |
168 | 100 | ||
169 | mutex_unlock(&b->c->verify_lock); | 101 | mutex_unlock(&b->c->verify_lock); |
102 | up(&b->io_mutex); | ||
170 | } | 103 | } |
171 | 104 | ||
172 | void bch_data_verify(struct cached_dev *dc, struct bio *bio) | 105 | void bch_data_verify(struct cached_dev *dc, struct bio *bio) |
173 | { | 106 | { |
174 | char name[BDEVNAME_SIZE]; | 107 | char name[BDEVNAME_SIZE]; |
175 | struct bio *check; | 108 | struct bio *check; |
176 | struct bio_vec *bv; | 109 | struct bio_vec bv, *bv2; |
110 | struct bvec_iter iter; | ||
177 | int i; | 111 | int i; |
178 | 112 | ||
179 | check = bio_clone(bio, GFP_NOIO); | 113 | check = bio_clone(bio, GFP_NOIO); |
@@ -185,95 +119,27 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) | |||
185 | 119 | ||
186 | submit_bio_wait(READ_SYNC, check); | 120 | submit_bio_wait(READ_SYNC, check); |
187 | 121 | ||
188 | bio_for_each_segment(bv, bio, i) { | 122 | bio_for_each_segment(bv, bio, iter) { |
189 | void *p1 = kmap_atomic(bv->bv_page); | 123 | void *p1 = kmap_atomic(bv.bv_page); |
190 | void *p2 = page_address(check->bi_io_vec[i].bv_page); | 124 | void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); |
191 | 125 | ||
192 | cache_set_err_on(memcmp(p1 + bv->bv_offset, | 126 | cache_set_err_on(memcmp(p1 + bv.bv_offset, |
193 | p2 + bv->bv_offset, | 127 | p2 + bv.bv_offset, |
194 | bv->bv_len), | 128 | bv.bv_len), |
195 | dc->disk.c, | 129 | dc->disk.c, |
196 | "verify failed at dev %s sector %llu", | 130 | "verify failed at dev %s sector %llu", |
197 | bdevname(dc->bdev, name), | 131 | bdevname(dc->bdev, name), |
198 | (uint64_t) bio->bi_sector); | 132 | (uint64_t) bio->bi_iter.bi_sector); |
199 | 133 | ||
200 | kunmap_atomic(p1); | 134 | kunmap_atomic(p1); |
201 | } | 135 | } |
202 | 136 | ||
203 | bio_for_each_segment_all(bv, check, i) | 137 | bio_for_each_segment_all(bv2, check, i) |
204 | __free_page(bv->bv_page); | 138 | __free_page(bv2->bv_page); |
205 | out_put: | 139 | out_put: |
206 | bio_put(check); | 140 | bio_put(check); |
207 | } | 141 | } |
208 | 142 | ||
209 | int __bch_count_data(struct btree *b) | ||
210 | { | ||
211 | unsigned ret = 0; | ||
212 | struct btree_iter iter; | ||
213 | struct bkey *k; | ||
214 | |||
215 | if (!b->level) | ||
216 | for_each_key(b, k, &iter) | ||
217 | ret += KEY_SIZE(k); | ||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | void __bch_check_keys(struct btree *b, const char *fmt, ...) | ||
222 | { | ||
223 | va_list args; | ||
224 | struct bkey *k, *p = NULL; | ||
225 | struct btree_iter iter; | ||
226 | const char *err; | ||
227 | |||
228 | for_each_key(b, k, &iter) { | ||
229 | if (!b->level) { | ||
230 | err = "Keys out of order"; | ||
231 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) | ||
232 | goto bug; | ||
233 | |||
234 | if (bch_ptr_invalid(b, k)) | ||
235 | continue; | ||
236 | |||
237 | err = "Overlapping keys"; | ||
238 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) | ||
239 | goto bug; | ||
240 | } else { | ||
241 | if (bch_ptr_bad(b, k)) | ||
242 | continue; | ||
243 | |||
244 | err = "Duplicate keys"; | ||
245 | if (p && !bkey_cmp(p, k)) | ||
246 | goto bug; | ||
247 | } | ||
248 | p = k; | ||
249 | } | ||
250 | |||
251 | err = "Key larger than btree node key"; | ||
252 | if (p && bkey_cmp(p, &b->key) > 0) | ||
253 | goto bug; | ||
254 | |||
255 | return; | ||
256 | bug: | ||
257 | bch_dump_bucket(b); | ||
258 | |||
259 | va_start(args, fmt); | ||
260 | vprintk(fmt, args); | ||
261 | va_end(args); | ||
262 | |||
263 | panic("bcache error: %s:\n", err); | ||
264 | } | ||
265 | |||
266 | void bch_btree_iter_next_check(struct btree_iter *iter) | ||
267 | { | ||
268 | struct bkey *k = iter->data->k, *next = bkey_next(k); | ||
269 | |||
270 | if (next < iter->data->end && | ||
271 | bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { | ||
272 | bch_dump_bucket(iter->b); | ||
273 | panic("Key skipped backwards\n"); | ||
274 | } | ||
275 | } | ||
276 | |||
277 | #endif | 143 | #endif |
278 | 144 | ||
279 | #ifdef CONFIG_DEBUG_FS | 145 | #ifdef CONFIG_DEBUG_FS |
@@ -320,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | |||
320 | if (!w) | 186 | if (!w) |
321 | break; | 187 | break; |
322 | 188 | ||
323 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); | 189 | bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); |
324 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | 190 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); |
325 | bch_keybuf_del(&i->keys, w); | 191 | bch_keybuf_del(&i->keys, w); |
326 | } | 192 | } |
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 2ede60e31874..1f63c195d247 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h | |||
@@ -1,47 +1,30 @@ | |||
1 | #ifndef _BCACHE_DEBUG_H | 1 | #ifndef _BCACHE_DEBUG_H |
2 | #define _BCACHE_DEBUG_H | 2 | #define _BCACHE_DEBUG_H |
3 | 3 | ||
4 | /* Btree/bkey debug printing */ | 4 | struct bio; |
5 | 5 | struct cached_dev; | |
6 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); | 6 | struct cache_set; |
7 | 7 | ||
8 | #ifdef CONFIG_BCACHE_DEBUG | 8 | #ifdef CONFIG_BCACHE_DEBUG |
9 | 9 | ||
10 | void bch_btree_verify(struct btree *, struct bset *); | 10 | void bch_btree_verify(struct btree *); |
11 | void bch_data_verify(struct cached_dev *, struct bio *); | 11 | void bch_data_verify(struct cached_dev *, struct bio *); |
12 | int __bch_count_data(struct btree *); | ||
13 | void __bch_check_keys(struct btree *, const char *, ...); | ||
14 | void bch_btree_iter_next_check(struct btree_iter *); | ||
15 | 12 | ||
16 | #define EBUG_ON(cond) BUG_ON(cond) | ||
17 | #define expensive_debug_checks(c) ((c)->expensive_debug_checks) | 13 | #define expensive_debug_checks(c) ((c)->expensive_debug_checks) |
18 | #define key_merging_disabled(c) ((c)->key_merging_disabled) | 14 | #define key_merging_disabled(c) ((c)->key_merging_disabled) |
19 | #define bypass_torture_test(d) ((d)->bypass_torture_test) | 15 | #define bypass_torture_test(d) ((d)->bypass_torture_test) |
20 | 16 | ||
21 | #else /* DEBUG */ | 17 | #else /* DEBUG */ |
22 | 18 | ||
23 | static inline void bch_btree_verify(struct btree *b, struct bset *i) {} | 19 | static inline void bch_btree_verify(struct btree *b) {} |
24 | static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} | 20 | static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} |
25 | static inline int __bch_count_data(struct btree *b) { return -1; } | ||
26 | static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} | ||
27 | static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} | ||
28 | 21 | ||
29 | #define EBUG_ON(cond) do { if (cond); } while (0) | ||
30 | #define expensive_debug_checks(c) 0 | 22 | #define expensive_debug_checks(c) 0 |
31 | #define key_merging_disabled(c) 0 | 23 | #define key_merging_disabled(c) 0 |
32 | #define bypass_torture_test(d) 0 | 24 | #define bypass_torture_test(d) 0 |
33 | 25 | ||
34 | #endif | 26 | #endif |
35 | 27 | ||
36 | #define bch_count_data(b) \ | ||
37 | (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) | ||
38 | |||
39 | #define bch_check_keys(b, ...) \ | ||
40 | do { \ | ||
41 | if (expensive_debug_checks((b)->c)) \ | ||
42 | __bch_check_keys(b, __VA_ARGS__); \ | ||
43 | } while (0) | ||
44 | |||
45 | #ifdef CONFIG_DEBUG_FS | 28 | #ifdef CONFIG_DEBUG_FS |
46 | void bch_debug_init_cache_set(struct cache_set *); | 29 | void bch_debug_init_cache_set(struct cache_set *); |
47 | #else | 30 | #else |
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 000000000000..416d1a3e028e --- /dev/null +++ b/drivers/md/bcache/extents.c | |||
@@ -0,0 +1,616 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | ||
3 | * | ||
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | ||
5 | * All allocation is done in buckets, which should match the erase block size | ||
6 | * of the device. | ||
7 | * | ||
8 | * Buckets containing cached data are kept on a heap sorted by priority; | ||
9 | * bucket priority is increased on cache hit, and periodically all the buckets | ||
10 | * on the heap have their priority scaled down. This currently is just used as | ||
11 | * an LRU but in the future should allow for more intelligent heuristics. | ||
12 | * | ||
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | ||
14 | * counter. Garbage collection is used to remove stale pointers. | ||
15 | * | ||
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | ||
17 | * as keys are inserted we only sort the pages that have not yet been written. | ||
18 | * When garbage collection is run, we resort the entire node. | ||
19 | * | ||
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | ||
21 | */ | ||
22 | |||
23 | #include "bcache.h" | ||
24 | #include "btree.h" | ||
25 | #include "debug.h" | ||
26 | #include "extents.h" | ||
27 | #include "writeback.h" | ||
28 | |||
29 | static void sort_key_next(struct btree_iter *iter, | ||
30 | struct btree_iter_set *i) | ||
31 | { | ||
32 | i->k = bkey_next(i->k); | ||
33 | |||
34 | if (i->k == i->end) | ||
35 | *i = iter->data[--iter->used]; | ||
36 | } | ||
37 | |||
38 | static bool bch_key_sort_cmp(struct btree_iter_set l, | ||
39 | struct btree_iter_set r) | ||
40 | { | ||
41 | int64_t c = bkey_cmp(l.k, r.k); | ||
42 | |||
43 | return c ? c > 0 : l.k < r.k; | ||
44 | } | ||
45 | |||
46 | static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
47 | { | ||
48 | unsigned i; | ||
49 | |||
50 | for (i = 0; i < KEY_PTRS(k); i++) | ||
51 | if (ptr_available(c, k, i)) { | ||
52 | struct cache *ca = PTR_CACHE(c, k, i); | ||
53 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
54 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
55 | |||
56 | if (KEY_SIZE(k) + r > c->sb.bucket_size || | ||
57 | bucket < ca->sb.first_bucket || | ||
58 | bucket >= ca->sb.nbuckets) | ||
59 | return true; | ||
60 | } | ||
61 | |||
62 | return false; | ||
63 | } | ||
64 | |||
65 | /* Common among btree and extent ptrs */ | ||
66 | |||
67 | static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||
68 | { | ||
69 | unsigned i; | ||
70 | |||
71 | for (i = 0; i < KEY_PTRS(k); i++) | ||
72 | if (ptr_available(c, k, i)) { | ||
73 | struct cache *ca = PTR_CACHE(c, k, i); | ||
74 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
75 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
76 | |||
77 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||
78 | return "bad, length too big"; | ||
79 | if (bucket < ca->sb.first_bucket) | ||
80 | return "bad, short offset"; | ||
81 | if (bucket >= ca->sb.nbuckets) | ||
82 | return "bad, offset past end of device"; | ||
83 | if (ptr_stale(c, k, i)) | ||
84 | return "stale"; | ||
85 | } | ||
86 | |||
87 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
88 | return "bad, null key"; | ||
89 | if (!KEY_PTRS(k)) | ||
90 | return "bad, no pointers"; | ||
91 | if (!KEY_SIZE(k)) | ||
92 | return "zeroed key"; | ||
93 | return ""; | ||
94 | } | ||
95 | |||
96 | void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) | ||
97 | { | ||
98 | unsigned i = 0; | ||
99 | char *out = buf, *end = buf + size; | ||
100 | |||
101 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | ||
102 | |||
103 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); | ||
104 | |||
105 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
106 | if (i) | ||
107 | p(", "); | ||
108 | |||
109 | if (PTR_DEV(k, i) == PTR_CHECK_DEV) | ||
110 | p("check dev"); | ||
111 | else | ||
112 | p("%llu:%llu gen %llu", PTR_DEV(k, i), | ||
113 | PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||
114 | } | ||
115 | |||
116 | p("]"); | ||
117 | |||
118 | if (KEY_DIRTY(k)) | ||
119 | p(" dirty"); | ||
120 | if (KEY_CSUM(k)) | ||
121 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||
122 | #undef p | ||
123 | } | ||
124 | |||
125 | static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) | ||
126 | { | ||
127 | struct btree *b = container_of(keys, struct btree, keys); | ||
128 | unsigned j; | ||
129 | char buf[80]; | ||
130 | |||
131 | bch_extent_to_text(buf, sizeof(buf), k); | ||
132 | printk(" %s", buf); | ||
133 | |||
134 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
135 | size_t n = PTR_BUCKET_NR(b->c, k, j); | ||
136 | printk(" bucket %zu", n); | ||
137 | |||
138 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||
139 | printk(" prio %i", | ||
140 | PTR_BUCKET(b->c, k, j)->prio); | ||
141 | } | ||
142 | |||
143 | printk(" %s\n", bch_ptr_status(b->c, k)); | ||
144 | } | ||
145 | |||
146 | /* Btree ptrs */ | ||
147 | |||
148 | bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
149 | { | ||
150 | char buf[80]; | ||
151 | |||
152 | if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) | ||
153 | goto bad; | ||
154 | |||
155 | if (__ptr_invalid(c, k)) | ||
156 | goto bad; | ||
157 | |||
158 | return false; | ||
159 | bad: | ||
160 | bch_extent_to_text(buf, sizeof(buf), k); | ||
161 | cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); | ||
162 | return true; | ||
163 | } | ||
164 | |||
165 | static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) | ||
166 | { | ||
167 | struct btree *b = container_of(bk, struct btree, keys); | ||
168 | return __bch_btree_ptr_invalid(b->c, k); | ||
169 | } | ||
170 | |||
171 | static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) | ||
172 | { | ||
173 | unsigned i; | ||
174 | char buf[80]; | ||
175 | struct bucket *g; | ||
176 | |||
177 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
178 | for (i = 0; i < KEY_PTRS(k); i++) | ||
179 | if (ptr_available(b->c, k, i)) { | ||
180 | g = PTR_BUCKET(b->c, k, i); | ||
181 | |||
182 | if (KEY_DIRTY(k) || | ||
183 | g->prio != BTREE_PRIO || | ||
184 | (b->c->gc_mark_valid && | ||
185 | GC_MARK(g) != GC_MARK_METADATA)) | ||
186 | goto err; | ||
187 | } | ||
188 | |||
189 | mutex_unlock(&b->c->bucket_lock); | ||
190 | } | ||
191 | |||
192 | return false; | ||
193 | err: | ||
194 | mutex_unlock(&b->c->bucket_lock); | ||
195 | bch_extent_to_text(buf, sizeof(buf), k); | ||
196 | btree_bug(b, | ||
197 | "inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
198 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | ||
199 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
200 | return true; | ||
201 | } | ||
202 | |||
203 | static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) | ||
204 | { | ||
205 | struct btree *b = container_of(bk, struct btree, keys); | ||
206 | unsigned i; | ||
207 | |||
208 | if (!bkey_cmp(k, &ZERO_KEY) || | ||
209 | !KEY_PTRS(k) || | ||
210 | bch_ptr_invalid(bk, k)) | ||
211 | return true; | ||
212 | |||
213 | for (i = 0; i < KEY_PTRS(k); i++) | ||
214 | if (!ptr_available(b->c, k, i) || | ||
215 | ptr_stale(b->c, k, i)) | ||
216 | return true; | ||
217 | |||
218 | if (expensive_debug_checks(b->c) && | ||
219 | btree_ptr_bad_expensive(b, k)) | ||
220 | return true; | ||
221 | |||
222 | return false; | ||
223 | } | ||
224 | |||
225 | static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, | ||
226 | struct bkey *insert, | ||
227 | struct btree_iter *iter, | ||
228 | struct bkey *replace_key) | ||
229 | { | ||
230 | struct btree *b = container_of(bk, struct btree, keys); | ||
231 | |||
232 | if (!KEY_OFFSET(insert)) | ||
233 | btree_current_write(b)->prio_blocked++; | ||
234 | |||
235 | return false; | ||
236 | } | ||
237 | |||
238 | const struct btree_keys_ops bch_btree_keys_ops = { | ||
239 | .sort_cmp = bch_key_sort_cmp, | ||
240 | .insert_fixup = bch_btree_ptr_insert_fixup, | ||
241 | .key_invalid = bch_btree_ptr_invalid, | ||
242 | .key_bad = bch_btree_ptr_bad, | ||
243 | .key_to_text = bch_extent_to_text, | ||
244 | .key_dump = bch_bkey_dump, | ||
245 | }; | ||
246 | |||
247 | /* Extents */ | ||
248 | |||
249 | /* | ||
250 | * Returns true if l > r - unless l == r, in which case returns true if l is | ||
251 | * older than r. | ||
252 | * | ||
253 | * Necessary for btree_sort_fixup() - if there are multiple keys that compare | ||
254 | * equal in different sets, we have to process them newest to oldest. | ||
255 | */ | ||
256 | static bool bch_extent_sort_cmp(struct btree_iter_set l, | ||
257 | struct btree_iter_set r) | ||
258 | { | ||
259 | int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | ||
260 | |||
261 | return c ? c > 0 : l.k < r.k; | ||
262 | } | ||
263 | |||
264 | static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, | ||
265 | struct bkey *tmp) | ||
266 | { | ||
267 | while (iter->used > 1) { | ||
268 | struct btree_iter_set *top = iter->data, *i = top + 1; | ||
269 | |||
270 | if (iter->used > 2 && | ||
271 | bch_extent_sort_cmp(i[0], i[1])) | ||
272 | i++; | ||
273 | |||
274 | if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) | ||
275 | break; | ||
276 | |||
277 | if (!KEY_SIZE(i->k)) { | ||
278 | sort_key_next(iter, i); | ||
279 | heap_sift(iter, i - top, bch_extent_sort_cmp); | ||
280 | continue; | ||
281 | } | ||
282 | |||
283 | if (top->k > i->k) { | ||
284 | if (bkey_cmp(top->k, i->k) >= 0) | ||
285 | sort_key_next(iter, i); | ||
286 | else | ||
287 | bch_cut_front(top->k, i->k); | ||
288 | |||
289 | heap_sift(iter, i - top, bch_extent_sort_cmp); | ||
290 | } else { | ||
291 | /* can't happen because of comparison func */ | ||
292 | BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); | ||
293 | |||
294 | if (bkey_cmp(i->k, top->k) < 0) { | ||
295 | bkey_copy(tmp, top->k); | ||
296 | |||
297 | bch_cut_back(&START_KEY(i->k), tmp); | ||
298 | bch_cut_front(i->k, top->k); | ||
299 | heap_sift(iter, 0, bch_extent_sort_cmp); | ||
300 | |||
301 | return tmp; | ||
302 | } else { | ||
303 | bch_cut_back(&START_KEY(i->k), top->k); | ||
304 | } | ||
305 | } | ||
306 | } | ||
307 | |||
308 | return NULL; | ||
309 | } | ||
310 | |||
311 | static bool bch_extent_insert_fixup(struct btree_keys *b, | ||
312 | struct bkey *insert, | ||
313 | struct btree_iter *iter, | ||
314 | struct bkey *replace_key) | ||
315 | { | ||
316 | struct cache_set *c = container_of(b, struct btree, keys)->c; | ||
317 | |||
318 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | ||
319 | { | ||
320 | if (KEY_DIRTY(k)) | ||
321 | bcache_dev_sectors_dirty_add(c, KEY_INODE(k), | ||
322 | offset, -sectors); | ||
323 | } | ||
324 | |||
325 | uint64_t old_offset; | ||
326 | unsigned old_size, sectors_found = 0; | ||
327 | |||
328 | BUG_ON(!KEY_OFFSET(insert)); | ||
329 | BUG_ON(!KEY_SIZE(insert)); | ||
330 | |||
331 | while (1) { | ||
332 | struct bkey *k = bch_btree_iter_next(iter); | ||
333 | if (!k) | ||
334 | break; | ||
335 | |||
336 | if (bkey_cmp(&START_KEY(k), insert) >= 0) { | ||
337 | if (KEY_SIZE(k)) | ||
338 | break; | ||
339 | else | ||
340 | continue; | ||
341 | } | ||
342 | |||
343 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||
344 | continue; | ||
345 | |||
346 | old_offset = KEY_START(k); | ||
347 | old_size = KEY_SIZE(k); | ||
348 | |||
349 | /* | ||
350 | * We might overlap with 0 size extents; we can't skip these | ||
351 | * because if they're in the set we're inserting to we have to | ||
352 | * adjust them so they don't overlap with the key we're | ||
353 | * inserting. But we don't want to check them for replace | ||
354 | * operations. | ||
355 | */ | ||
356 | |||
357 | if (replace_key && KEY_SIZE(k)) { | ||
358 | /* | ||
359 | * k might have been split since we inserted/found the | ||
360 | * key we're replacing | ||
361 | */ | ||
362 | unsigned i; | ||
363 | uint64_t offset = KEY_START(k) - | ||
364 | KEY_START(replace_key); | ||
365 | |||
366 | /* But it must be a subset of the replace key */ | ||
367 | if (KEY_START(k) < KEY_START(replace_key) || | ||
368 | KEY_OFFSET(k) > KEY_OFFSET(replace_key)) | ||
369 | goto check_failed; | ||
370 | |||
371 | /* We didn't find a key that we were supposed to */ | ||
372 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||
373 | goto check_failed; | ||
374 | |||
375 | if (!bch_bkey_equal_header(k, replace_key)) | ||
376 | goto check_failed; | ||
377 | |||
378 | /* skip past gen */ | ||
379 | offset <<= 8; | ||
380 | |||
381 | BUG_ON(!KEY_PTRS(replace_key)); | ||
382 | |||
383 | for (i = 0; i < KEY_PTRS(replace_key); i++) | ||
384 | if (k->ptr[i] != replace_key->ptr[i] + offset) | ||
385 | goto check_failed; | ||
386 | |||
387 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||
388 | } | ||
389 | |||
390 | if (bkey_cmp(insert, k) < 0 && | ||
391 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||
392 | /* | ||
393 | * We overlapped in the middle of an existing key: that | ||
394 | * means we have to split the old key. But we have to do | ||
395 | * slightly different things depending on whether the | ||
396 | * old key has been written out yet. | ||
397 | */ | ||
398 | |||
399 | struct bkey *top; | ||
400 | |||
401 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | ||
402 | |||
403 | if (bkey_written(b, k)) { | ||
404 | /* | ||
405 | * We insert a new key to cover the top of the | ||
406 | * old key, and the old key is modified in place | ||
407 | * to represent the bottom split. | ||
408 | * | ||
409 | * It's completely arbitrary whether the new key | ||
410 | * is the top or the bottom, but it has to match | ||
411 | * up with what btree_sort_fixup() does - it | ||
412 | * doesn't check for this kind of overlap, it | ||
413 | * depends on us inserting a new key for the top | ||
414 | * here. | ||
415 | */ | ||
416 | top = bch_bset_search(b, bset_tree_last(b), | ||
417 | insert); | ||
418 | bch_bset_insert(b, top, k); | ||
419 | } else { | ||
420 | BKEY_PADDED(key) temp; | ||
421 | bkey_copy(&temp.key, k); | ||
422 | bch_bset_insert(b, k, &temp.key); | ||
423 | top = bkey_next(k); | ||
424 | } | ||
425 | |||
426 | bch_cut_front(insert, top); | ||
427 | bch_cut_back(&START_KEY(insert), k); | ||
428 | bch_bset_fix_invalidated_key(b, k); | ||
429 | goto out; | ||
430 | } | ||
431 | |||
432 | if (bkey_cmp(insert, k) < 0) { | ||
433 | bch_cut_front(insert, k); | ||
434 | } else { | ||
435 | if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) | ||
436 | old_offset = KEY_START(insert); | ||
437 | |||
438 | if (bkey_written(b, k) && | ||
439 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||
440 | /* | ||
441 | * Completely overwrote, so we don't have to | ||
442 | * invalidate the binary search tree | ||
443 | */ | ||
444 | bch_cut_front(k, k); | ||
445 | } else { | ||
446 | __bch_cut_back(&START_KEY(insert), k); | ||
447 | bch_bset_fix_invalidated_key(b, k); | ||
448 | } | ||
449 | } | ||
450 | |||
451 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | ||
452 | } | ||
453 | |||
454 | check_failed: | ||
455 | if (replace_key) { | ||
456 | if (!sectors_found) { | ||
457 | return true; | ||
458 | } else if (sectors_found < KEY_SIZE(insert)) { | ||
459 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||
460 | (KEY_SIZE(insert) - sectors_found)); | ||
461 | SET_KEY_SIZE(insert, sectors_found); | ||
462 | } | ||
463 | } | ||
464 | out: | ||
465 | if (KEY_DIRTY(insert)) | ||
466 | bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), | ||
467 | KEY_START(insert), | ||
468 | KEY_SIZE(insert)); | ||
469 | |||
470 | return false; | ||
471 | } | ||
472 | |||
473 | static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) | ||
474 | { | ||
475 | struct btree *b = container_of(bk, struct btree, keys); | ||
476 | char buf[80]; | ||
477 | |||
478 | if (!KEY_SIZE(k)) | ||
479 | return true; | ||
480 | |||
481 | if (KEY_SIZE(k) > KEY_OFFSET(k)) | ||
482 | goto bad; | ||
483 | |||
484 | if (__ptr_invalid(b->c, k)) | ||
485 | goto bad; | ||
486 | |||
487 | return false; | ||
488 | bad: | ||
489 | bch_extent_to_text(buf, sizeof(buf), k); | ||
490 | cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); | ||
491 | return true; | ||
492 | } | ||
493 | |||
494 | static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, | ||
495 | unsigned ptr) | ||
496 | { | ||
497 | struct bucket *g = PTR_BUCKET(b->c, k, ptr); | ||
498 | char buf[80]; | ||
499 | |||
500 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
501 | if (b->c->gc_mark_valid && | ||
502 | ((GC_MARK(g) != GC_MARK_DIRTY && | ||
503 | KEY_DIRTY(k)) || | ||
504 | GC_MARK(g) == GC_MARK_METADATA)) | ||
505 | goto err; | ||
506 | |||
507 | if (g->prio == BTREE_PRIO) | ||
508 | goto err; | ||
509 | |||
510 | mutex_unlock(&b->c->bucket_lock); | ||
511 | } | ||
512 | |||
513 | return false; | ||
514 | err: | ||
515 | mutex_unlock(&b->c->bucket_lock); | ||
516 | bch_extent_to_text(buf, sizeof(buf), k); | ||
517 | btree_bug(b, | ||
518 | "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
519 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | ||
520 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
521 | return true; | ||
522 | } | ||
523 | |||
524 | static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) | ||
525 | { | ||
526 | struct btree *b = container_of(bk, struct btree, keys); | ||
527 | struct bucket *g; | ||
528 | unsigned i, stale; | ||
529 | |||
530 | if (!KEY_PTRS(k) || | ||
531 | bch_extent_invalid(bk, k)) | ||
532 | return true; | ||
533 | |||
534 | for (i = 0; i < KEY_PTRS(k); i++) | ||
535 | if (!ptr_available(b->c, k, i)) | ||
536 | return true; | ||
537 | |||
538 | if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) | ||
539 | return false; | ||
540 | |||
541 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
542 | g = PTR_BUCKET(b->c, k, i); | ||
543 | stale = ptr_stale(b->c, k, i); | ||
544 | |||
545 | btree_bug_on(stale > 96, b, | ||
546 | "key too stale: %i, need_gc %u", | ||
547 | stale, b->c->need_gc); | ||
548 | |||
549 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||
550 | b, "stale dirty pointer"); | ||
551 | |||
552 | if (stale) | ||
553 | return true; | ||
554 | |||
555 | if (expensive_debug_checks(b->c) && | ||
556 | bch_extent_bad_expensive(b, k, i)) | ||
557 | return true; | ||
558 | } | ||
559 | |||
560 | return false; | ||
561 | } | ||
562 | |||
563 | static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | ||
564 | { | ||
565 | return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | ||
566 | ~((uint64_t)1 << 63); | ||
567 | } | ||
568 | |||
569 | static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) | ||
570 | { | ||
571 | struct btree *b = container_of(bk, struct btree, keys); | ||
572 | unsigned i; | ||
573 | |||
574 | if (key_merging_disabled(b->c)) | ||
575 | return false; | ||
576 | |||
577 | for (i = 0; i < KEY_PTRS(l); i++) | ||
578 | if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || | ||
579 | PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) | ||
580 | return false; | ||
581 | |||
582 | /* Keys with no pointers aren't restricted to one bucket and could | ||
583 | * overflow KEY_SIZE | ||
584 | */ | ||
585 | if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { | ||
586 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); | ||
587 | SET_KEY_SIZE(l, USHRT_MAX); | ||
588 | |||
589 | bch_cut_front(l, r); | ||
590 | return false; | ||
591 | } | ||
592 | |||
593 | if (KEY_CSUM(l)) { | ||
594 | if (KEY_CSUM(r)) | ||
595 | l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); | ||
596 | else | ||
597 | SET_KEY_CSUM(l, 0); | ||
598 | } | ||
599 | |||
600 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); | ||
601 | SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); | ||
602 | |||
603 | return true; | ||
604 | } | ||
605 | |||
606 | const struct btree_keys_ops bch_extent_keys_ops = { | ||
607 | .sort_cmp = bch_extent_sort_cmp, | ||
608 | .sort_fixup = bch_extent_sort_fixup, | ||
609 | .insert_fixup = bch_extent_insert_fixup, | ||
610 | .key_invalid = bch_extent_invalid, | ||
611 | .key_bad = bch_extent_bad, | ||
612 | .key_merge = bch_extent_merge, | ||
613 | .key_to_text = bch_extent_to_text, | ||
614 | .key_dump = bch_bkey_dump, | ||
615 | .is_extents = true, | ||
616 | }; | ||
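The merge path above concatenates two adjacent extents by bumping the left key's end offset and size, clamping at the 16-bit KEY_SIZE limit and cutting the leftover off the front of the right key. Below is a minimal user-space sketch of just that offset/size arithmetic, using a stand-in struct ext (offset records the extent's end, as bcache keys do); the real code's pointer/bucket adjacency checks, checksum handling and bkey macros are deliberately left out.

/*
 * Simplified model of the size-clamping arithmetic in bch_extent_merge().
 * Types and helpers are stand-ins, not the kernel structures.
 */
#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ext {
	uint64_t offset;	/* end of the extent, in sectors */
	uint64_t size;		/* length, in sectors */
};

static uint64_t ext_start(const struct ext *e)
{
	return e->offset - e->size;
}

/* Merge r into l if they are contiguous; mirrors the offset/size updates. */
static bool ext_merge(struct ext *l, struct ext *r)
{
	if (l->offset != ext_start(r))	/* not contiguous */
		return false;

	if (l->size + r->size > USHRT_MAX) {
		/* Grow l up to the 16-bit limit, shrink r from the front. */
		uint64_t grow = USHRT_MAX - l->size;

		l->offset += grow;
		l->size = USHRT_MAX;
		r->size -= grow;
		return false;	/* partial merge: both keys remain */
	}

	l->offset += r->size;
	l->size += r->size;
	return true;
}

int main(void)
{
	struct ext l = { .offset = 1024, .size = 1024 };
	struct ext r = { .offset = 1536, .size = 512 };

	assert(ext_merge(&l, &r));
	assert(l.offset == 1536 && l.size == 1536);
	printf("merged: end %llu size %llu\n",
	       (unsigned long long)l.offset, (unsigned long long)l.size);
	return 0;
}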
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 000000000000..e4e23409782d --- /dev/null +++ b/drivers/md/bcache/extents.h | |||
@@ -0,0 +1,13 @@ | |||
1 | #ifndef _BCACHE_EXTENTS_H | ||
2 | #define _BCACHE_EXTENTS_H | ||
3 | |||
4 | extern const struct btree_keys_ops bch_btree_keys_ops; | ||
5 | extern const struct btree_keys_ops bch_extent_keys_ops; | ||
6 | |||
7 | struct bkey; | ||
8 | struct cache_set; | ||
9 | |||
10 | void bch_extent_to_text(char *, size_t, const struct bkey *); | ||
11 | bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); | ||
12 | |||
13 | #endif /* _BCACHE_EXTENTS_H */ | ||
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9056632995b1..fa028fa82df4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c | |||
@@ -11,178 +11,40 @@ | |||
11 | 11 | ||
12 | #include <linux/blkdev.h> | 12 | #include <linux/blkdev.h> |
13 | 13 | ||
14 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) | ||
15 | { | ||
16 | struct bio *p = bio->bi_private; | ||
17 | |||
18 | bio_endio(p, error); | ||
19 | bio_put(bio); | ||
20 | } | ||
21 | |||
22 | static void bch_generic_make_request_hack(struct bio *bio) | ||
23 | { | ||
24 | if (bio->bi_idx) { | ||
25 | struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); | ||
26 | |||
27 | memcpy(clone->bi_io_vec, | ||
28 | bio_iovec(bio), | ||
29 | bio_segments(bio) * sizeof(struct bio_vec)); | ||
30 | |||
31 | clone->bi_sector = bio->bi_sector; | ||
32 | clone->bi_bdev = bio->bi_bdev; | ||
33 | clone->bi_rw = bio->bi_rw; | ||
34 | clone->bi_vcnt = bio_segments(bio); | ||
35 | clone->bi_size = bio->bi_size; | ||
36 | |||
37 | clone->bi_private = bio; | ||
38 | clone->bi_end_io = bch_bi_idx_hack_endio; | ||
39 | |||
40 | bio = clone; | ||
41 | } | ||
42 | |||
43 | /* | ||
44 | * Hack, since drivers that clone bios clone up to bi_max_vecs, but our | ||
45 | * bios might have had more than that (before we split them per device | ||
46 | * limitations). | ||
47 | * | ||
48 | * To be taken out once immutable bvec stuff is in. | ||
49 | */ | ||
50 | bio->bi_max_vecs = bio->bi_vcnt; | ||
51 | |||
52 | generic_make_request(bio); | ||
53 | } | ||
54 | |||
55 | /** | ||
56 | * bch_bio_split - split a bio | ||
57 | * @bio: bio to split | ||
58 | * @sectors: number of sectors to split from the front of @bio | ||
59 | * @gfp: gfp mask | ||
60 | * @bs: bio set to allocate from | ||
61 | * | ||
62 | * Allocates and returns a new bio which represents @sectors from the start of | ||
63 | * @bio, and updates @bio to represent the remaining sectors. | ||
64 | * | ||
65 | * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio | ||
66 | * unchanged. | ||
67 | * | ||
68 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | ||
69 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not | ||
70 | * freed before the split. | ||
71 | */ | ||
72 | struct bio *bch_bio_split(struct bio *bio, int sectors, | ||
73 | gfp_t gfp, struct bio_set *bs) | ||
74 | { | ||
75 | unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; | ||
76 | struct bio_vec *bv; | ||
77 | struct bio *ret = NULL; | ||
78 | |||
79 | BUG_ON(sectors <= 0); | ||
80 | |||
81 | if (sectors >= bio_sectors(bio)) | ||
82 | return bio; | ||
83 | |||
84 | if (bio->bi_rw & REQ_DISCARD) { | ||
85 | ret = bio_alloc_bioset(gfp, 1, bs); | ||
86 | if (!ret) | ||
87 | return NULL; | ||
88 | idx = 0; | ||
89 | goto out; | ||
90 | } | ||
91 | |||
92 | bio_for_each_segment(bv, bio, idx) { | ||
93 | vcnt = idx - bio->bi_idx; | ||
94 | |||
95 | if (!nbytes) { | ||
96 | ret = bio_alloc_bioset(gfp, vcnt, bs); | ||
97 | if (!ret) | ||
98 | return NULL; | ||
99 | |||
100 | memcpy(ret->bi_io_vec, bio_iovec(bio), | ||
101 | sizeof(struct bio_vec) * vcnt); | ||
102 | |||
103 | break; | ||
104 | } else if (nbytes < bv->bv_len) { | ||
105 | ret = bio_alloc_bioset(gfp, ++vcnt, bs); | ||
106 | if (!ret) | ||
107 | return NULL; | ||
108 | |||
109 | memcpy(ret->bi_io_vec, bio_iovec(bio), | ||
110 | sizeof(struct bio_vec) * vcnt); | ||
111 | |||
112 | ret->bi_io_vec[vcnt - 1].bv_len = nbytes; | ||
113 | bv->bv_offset += nbytes; | ||
114 | bv->bv_len -= nbytes; | ||
115 | break; | ||
116 | } | ||
117 | |||
118 | nbytes -= bv->bv_len; | ||
119 | } | ||
120 | out: | ||
121 | ret->bi_bdev = bio->bi_bdev; | ||
122 | ret->bi_sector = bio->bi_sector; | ||
123 | ret->bi_size = sectors << 9; | ||
124 | ret->bi_rw = bio->bi_rw; | ||
125 | ret->bi_vcnt = vcnt; | ||
126 | ret->bi_max_vecs = vcnt; | ||
127 | |||
128 | bio->bi_sector += sectors; | ||
129 | bio->bi_size -= sectors << 9; | ||
130 | bio->bi_idx = idx; | ||
131 | |||
132 | if (bio_integrity(bio)) { | ||
133 | if (bio_integrity_clone(ret, bio, gfp)) { | ||
134 | bio_put(ret); | ||
135 | return NULL; | ||
136 | } | ||
137 | |||
138 | bio_integrity_trim(ret, 0, bio_sectors(ret)); | ||
139 | bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); | ||
140 | } | ||
141 | |||
142 | return ret; | ||
143 | } | ||
144 | |||
145 | static unsigned bch_bio_max_sectors(struct bio *bio) | 14 | static unsigned bch_bio_max_sectors(struct bio *bio) |
146 | { | 15 | { |
147 | unsigned ret = bio_sectors(bio); | ||
148 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 16 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
149 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, | 17 | struct bio_vec bv; |
150 | queue_max_segments(q)); | 18 | struct bvec_iter iter; |
19 | unsigned ret = 0, seg = 0; | ||
151 | 20 | ||
152 | if (bio->bi_rw & REQ_DISCARD) | 21 | if (bio->bi_rw & REQ_DISCARD) |
153 | return min(ret, q->limits.max_discard_sectors); | 22 | return min(bio_sectors(bio), q->limits.max_discard_sectors); |
154 | 23 | ||
155 | if (bio_segments(bio) > max_segments || | 24 | bio_for_each_segment(bv, bio, iter) { |
156 | q->merge_bvec_fn) { | 25 | struct bvec_merge_data bvm = { |
157 | struct bio_vec *bv; | 26 | .bi_bdev = bio->bi_bdev, |
158 | int i, seg = 0; | 27 | .bi_sector = bio->bi_iter.bi_sector, |
159 | 28 | .bi_size = ret << 9, | |
160 | ret = 0; | 29 | .bi_rw = bio->bi_rw, |
161 | 30 | }; | |
162 | bio_for_each_segment(bv, bio, i) { | 31 | |
163 | struct bvec_merge_data bvm = { | 32 | if (seg == min_t(unsigned, BIO_MAX_PAGES, |
164 | .bi_bdev = bio->bi_bdev, | 33 | queue_max_segments(q))) |
165 | .bi_sector = bio->bi_sector, | 34 | break; |
166 | .bi_size = ret << 9, | ||
167 | .bi_rw = bio->bi_rw, | ||
168 | }; | ||
169 | |||
170 | if (seg == max_segments) | ||
171 | break; | ||
172 | 35 | ||
173 | if (q->merge_bvec_fn && | 36 | if (q->merge_bvec_fn && |
174 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) | 37 | q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) |
175 | break; | 38 | break; |
176 | 39 | ||
177 | seg++; | 40 | seg++; |
178 | ret += bv->bv_len >> 9; | 41 | ret += bv.bv_len >> 9; |
179 | } | ||
180 | } | 42 | } |
181 | 43 | ||
182 | ret = min(ret, queue_max_sectors(q)); | 44 | ret = min(ret, queue_max_sectors(q)); |
183 | 45 | ||
184 | WARN_ON(!ret); | 46 | WARN_ON(!ret); |
185 | ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); | 47 | ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9); |
186 | 48 | ||
187 | return ret; | 49 | return ret; |
188 | } | 50 | } |
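The reworked bch_bio_max_sectors() walks the bio with the immutable bvec iterator, summing segment lengths until it hits the queue's segment limit, then clamps to the queue's sector limit while still allowing at least the first segment. A simplified model of that accumulation follows; the segment lengths and limits are made up, and the q->merge_bvec_fn consultation is omitted.

/* User-space model of the segment-accumulation loop; inputs are illustrative. */
#include <stdio.h>

static unsigned max_sectors(const unsigned *seg_len, unsigned nr_segs,
			    unsigned max_segments, unsigned queue_max_sectors)
{
	unsigned ret = 0, seg;

	for (seg = 0; seg < nr_segs && seg < max_segments; seg++)
		ret += seg_len[seg] >> 9;

	if (ret > queue_max_sectors)
		ret = queue_max_sectors;

	/* Never return less than the first segment, or no progress is made. */
	if (nr_segs && ret < seg_len[0] >> 9)
		ret = seg_len[0] >> 9;

	return ret;
}

int main(void)
{
	unsigned segs[] = { 4096, 4096, 65536, 4096 };	/* segment lengths, bytes */

	printf("%u sectors\n", max_sectors(segs, 4, 2, 64));	/* prints 16 */
	return 0;
}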
@@ -193,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl) | |||
193 | 55 | ||
194 | s->bio->bi_end_io = s->bi_end_io; | 56 | s->bio->bi_end_io = s->bi_end_io; |
195 | s->bio->bi_private = s->bi_private; | 57 | s->bio->bi_private = s->bi_private; |
196 | bio_endio(s->bio, 0); | 58 | bio_endio_nodec(s->bio, 0); |
197 | 59 | ||
198 | closure_debug_destroy(&s->cl); | 60 | closure_debug_destroy(&s->cl); |
199 | mempool_free(s, s->p->bio_split_hook); | 61 | mempool_free(s, s->p->bio_split_hook); |
@@ -232,19 +94,19 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | |||
232 | bio_get(bio); | 94 | bio_get(bio); |
233 | 95 | ||
234 | do { | 96 | do { |
235 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), | 97 | n = bio_next_split(bio, bch_bio_max_sectors(bio), |
236 | GFP_NOIO, s->p->bio_split); | 98 | GFP_NOIO, s->p->bio_split); |
237 | 99 | ||
238 | n->bi_end_io = bch_bio_submit_split_endio; | 100 | n->bi_end_io = bch_bio_submit_split_endio; |
239 | n->bi_private = &s->cl; | 101 | n->bi_private = &s->cl; |
240 | 102 | ||
241 | closure_get(&s->cl); | 103 | closure_get(&s->cl); |
242 | bch_generic_make_request_hack(n); | 104 | generic_make_request(n); |
243 | } while (n != bio); | 105 | } while (n != bio); |
244 | 106 | ||
245 | continue_at(&s->cl, bch_bio_submit_split_done, NULL); | 107 | continue_at(&s->cl, bch_bio_submit_split_done, NULL); |
246 | submit: | 108 | submit: |
247 | bch_generic_make_request_hack(bio); | 109 | generic_make_request(bio); |
248 | } | 110 | } |
249 | 111 | ||
250 | /* Bios with headers */ | 112 | /* Bios with headers */ |
@@ -272,8 +134,8 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) | |||
272 | { | 134 | { |
273 | struct bbio *b = container_of(bio, struct bbio, bio); | 135 | struct bbio *b = container_of(bio, struct bbio, bio); |
274 | 136 | ||
275 | bio->bi_sector = PTR_OFFSET(&b->key, 0); | 137 | bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); |
276 | bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; | 138 | bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; |
277 | 139 | ||
278 | b->submit_time_us = local_clock_us(); | 140 | b->submit_time_us = local_clock_us(); |
279 | closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); | 141 | closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); |
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index ecdaa671bd50..18039affc306 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c | |||
@@ -44,17 +44,17 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, | |||
44 | 44 | ||
45 | closure_init_stack(&cl); | 45 | closure_init_stack(&cl); |
46 | 46 | ||
47 | pr_debug("reading %llu", (uint64_t) bucket); | 47 | pr_debug("reading %u", bucket_index); |
48 | 48 | ||
49 | while (offset < ca->sb.bucket_size) { | 49 | while (offset < ca->sb.bucket_size) { |
50 | reread: left = ca->sb.bucket_size - offset; | 50 | reread: left = ca->sb.bucket_size - offset; |
51 | len = min_t(unsigned, left, PAGE_SECTORS * 8); | 51 | len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); |
52 | 52 | ||
53 | bio_reset(bio); | 53 | bio_reset(bio); |
54 | bio->bi_sector = bucket + offset; | 54 | bio->bi_iter.bi_sector = bucket + offset; |
55 | bio->bi_bdev = ca->bdev; | 55 | bio->bi_bdev = ca->bdev; |
56 | bio->bi_rw = READ; | 56 | bio->bi_rw = READ; |
57 | bio->bi_size = len << 9; | 57 | bio->bi_iter.bi_size = len << 9; |
58 | 58 | ||
59 | bio->bi_end_io = journal_read_endio; | 59 | bio->bi_end_io = journal_read_endio; |
60 | bio->bi_private = &cl; | 60 | bio->bi_private = &cl; |
@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset; | |||
74 | struct list_head *where; | 74 | struct list_head *where; |
75 | size_t blocks, bytes = set_bytes(j); | 75 | size_t blocks, bytes = set_bytes(j); |
76 | 76 | ||
77 | if (j->magic != jset_magic(&ca->sb)) | 77 | if (j->magic != jset_magic(&ca->sb)) { |
78 | pr_debug("%u: bad magic", bucket_index); | ||
78 | return ret; | 79 | return ret; |
80 | } | ||
79 | 81 | ||
80 | if (bytes > left << 9) | 82 | if (bytes > left << 9 || |
83 | bytes > PAGE_SIZE << JSET_BITS) { | ||
84 | pr_info("%u: too big, %zu bytes, offset %u", | ||
85 | bucket_index, bytes, offset); | ||
81 | return ret; | 86 | return ret; |
87 | } | ||
82 | 88 | ||
83 | if (bytes > len << 9) | 89 | if (bytes > len << 9) |
84 | goto reread; | 90 | goto reread; |
85 | 91 | ||
86 | if (j->csum != csum_set(j)) | 92 | if (j->csum != csum_set(j)) { |
93 | pr_info("%u: bad csum, %zu bytes, offset %u", | ||
94 | bucket_index, bytes, offset); | ||
87 | return ret; | 95 | return ret; |
96 | } | ||
88 | 97 | ||
89 | blocks = set_blocks(j, ca->set); | 98 | blocks = set_blocks(j, block_bytes(ca->set)); |
90 | 99 | ||
91 | while (!list_empty(list)) { | 100 | while (!list_empty(list)) { |
92 | i = list_first_entry(list, | 101 | i = list_first_entry(list, |
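journal_read_bucket() now reports why it rejects a candidate jset and additionally bounds the entry against the read buffer (PAGE_SIZE << JSET_BITS), not only the space left in the bucket. The stand-alone sketch below shows that validation order; the header fields, magic value and buffer size are simplified stand-ins for the real jset layout.

/* Sketch of the jset validation order; constants and struct are stand-ins. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_JSET_MAGIC	0x245235c1a3625032ULL	/* stand-in magic */
#define MODEL_JSET_BUF_BYTES	(4096 << 3)		/* PAGE_SIZE << JSET_BITS */

struct jset_hdr {
	uint64_t magic;
	uint64_t csum;
	uint32_t u64s;			/* payload size in 64-bit words */
};

enum verdict { JSET_OK, JSET_BAD_MAGIC, JSET_TOO_BIG, JSET_REREAD, JSET_BAD_CSUM };

static enum verdict check_jset(const struct jset_hdr *j, size_t left_bytes,
			       size_t read_bytes, uint64_t expect_csum)
{
	size_t bytes = sizeof(*j) + j->u64s * sizeof(uint64_t);

	if (j->magic != MODEL_JSET_MAGIC)
		return JSET_BAD_MAGIC;

	/* Must fit both in the bucket and in the read buffer. */
	if (bytes > left_bytes || bytes > MODEL_JSET_BUF_BYTES)
		return JSET_TOO_BIG;

	if (bytes > read_bytes)
		return JSET_REREAD;	/* caller re-reads a bigger chunk */

	if (j->csum != expect_csum)
		return JSET_BAD_CSUM;

	return JSET_OK;
}

int main(void)
{
	struct jset_hdr j = { .magic = MODEL_JSET_MAGIC, .csum = 42, .u64s = 16 };

	printf("verdict %d\n", check_jset(&j, 65536, 4096, 42));	/* JSET_OK */
	return 0;
}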
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) | |||
275 | } | 284 | } |
276 | 285 | ||
277 | for (k = i->j.start; | 286 | for (k = i->j.start; |
278 | k < end(&i->j); | 287 | k < bset_bkey_last(&i->j); |
279 | k = bkey_next(k)) { | 288 | k = bkey_next(k)) { |
280 | unsigned j; | 289 | unsigned j; |
281 | 290 | ||
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) | |||
313 | n, i->j.seq - 1, start, end); | 322 | n, i->j.seq - 1, start, end); |
314 | 323 | ||
315 | for (k = i->j.start; | 324 | for (k = i->j.start; |
316 | k < end(&i->j); | 325 | k < bset_bkey_last(&i->j); |
317 | k = bkey_next(k)) { | 326 | k = bkey_next(k)) { |
318 | trace_bcache_journal_replay_key(k); | 327 | trace_bcache_journal_replay_key(k); |
319 | 328 | ||
@@ -437,13 +446,13 @@ static void do_journal_discard(struct cache *ca) | |||
437 | atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); | 446 | atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); |
438 | 447 | ||
439 | bio_init(bio); | 448 | bio_init(bio); |
440 | bio->bi_sector = bucket_to_sector(ca->set, | 449 | bio->bi_iter.bi_sector = bucket_to_sector(ca->set, |
441 | ca->sb.d[ja->discard_idx]); | 450 | ca->sb.d[ja->discard_idx]); |
442 | bio->bi_bdev = ca->bdev; | 451 | bio->bi_bdev = ca->bdev; |
443 | bio->bi_rw = REQ_WRITE|REQ_DISCARD; | 452 | bio->bi_rw = REQ_WRITE|REQ_DISCARD; |
444 | bio->bi_max_vecs = 1; | 453 | bio->bi_max_vecs = 1; |
445 | bio->bi_io_vec = bio->bi_inline_vecs; | 454 | bio->bi_io_vec = bio->bi_inline_vecs; |
446 | bio->bi_size = bucket_bytes(ca); | 455 | bio->bi_iter.bi_size = bucket_bytes(ca); |
447 | bio->bi_end_io = journal_discard_endio; | 456 | bio->bi_end_io = journal_discard_endio; |
448 | 457 | ||
449 | closure_get(&ca->set->cl); | 458 | closure_get(&ca->set->cl); |
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl) | |||
555 | continue_at_nobarrier(cl, journal_write, system_wq); | 564 | continue_at_nobarrier(cl, journal_write, system_wq); |
556 | } | 565 | } |
557 | 566 | ||
567 | static void journal_write_unlock(struct closure *cl) | ||
568 | { | ||
569 | struct cache_set *c = container_of(cl, struct cache_set, journal.io); | ||
570 | |||
571 | c->journal.io_in_flight = 0; | ||
572 | spin_unlock(&c->journal.lock); | ||
573 | } | ||
574 | |||
558 | static void journal_write_unlocked(struct closure *cl) | 575 | static void journal_write_unlocked(struct closure *cl) |
559 | __releases(c->journal.lock) | 576 | __releases(c->journal.lock) |
560 | { | 577 | { |
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl) | |||
562 | struct cache *ca; | 579 | struct cache *ca; |
563 | struct journal_write *w = c->journal.cur; | 580 | struct journal_write *w = c->journal.cur; |
564 | struct bkey *k = &c->journal.key; | 581 | struct bkey *k = &c->journal.key; |
565 | unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; | 582 | unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * |
583 | c->sb.block_size; | ||
566 | 584 | ||
567 | struct bio *bio; | 585 | struct bio *bio; |
568 | struct bio_list list; | 586 | struct bio_list list; |
569 | bio_list_init(&list); | 587 | bio_list_init(&list); |
570 | 588 | ||
571 | if (!w->need_write) { | 589 | if (!w->need_write) { |
572 | /* | 590 | closure_return_with_destructor(cl, journal_write_unlock); |
573 | * XXX: have to unlock closure before we unlock journal lock, | ||
574 | * else we race with bch_journal(). But this way we race | ||
575 | * against cache set unregister. Doh. | ||
576 | */ | ||
577 | set_closure_fn(cl, NULL, NULL); | ||
578 | closure_sub(cl, CLOSURE_RUNNING + 1); | ||
579 | spin_unlock(&c->journal.lock); | ||
580 | return; | ||
581 | } else if (journal_full(&c->journal)) { | 591 | } else if (journal_full(&c->journal)) { |
582 | journal_reclaim(c); | 592 | journal_reclaim(c); |
583 | spin_unlock(&c->journal.lock); | 593 | spin_unlock(&c->journal.lock); |
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) | |||
586 | continue_at(cl, journal_write, system_wq); | 596 | continue_at(cl, journal_write, system_wq); |
587 | } | 597 | } |
588 | 598 | ||
589 | c->journal.blocks_free -= set_blocks(w->data, c); | 599 | c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); |
590 | 600 | ||
591 | w->data->btree_level = c->root->level; | 601 | w->data->btree_level = c->root->level; |
592 | 602 | ||
@@ -608,10 +618,10 @@ static void journal_write_unlocked(struct closure *cl) | |||
608 | atomic_long_add(sectors, &ca->meta_sectors_written); | 618 | atomic_long_add(sectors, &ca->meta_sectors_written); |
609 | 619 | ||
610 | bio_reset(bio); | 620 | bio_reset(bio); |
611 | bio->bi_sector = PTR_OFFSET(k, i); | 621 | bio->bi_iter.bi_sector = PTR_OFFSET(k, i); |
612 | bio->bi_bdev = ca->bdev; | 622 | bio->bi_bdev = ca->bdev; |
613 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; | 623 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; |
614 | bio->bi_size = sectors << 9; | 624 | bio->bi_iter.bi_size = sectors << 9; |
615 | 625 | ||
616 | bio->bi_end_io = journal_write_endio; | 626 | bio->bi_end_io = journal_write_endio; |
617 | bio->bi_private = w; | 627 | bio->bi_private = w; |
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c) | |||
653 | 663 | ||
654 | w->need_write = true; | 664 | w->need_write = true; |
655 | 665 | ||
656 | if (closure_trylock(cl, &c->cl)) | 666 | if (!c->journal.io_in_flight) { |
657 | journal_write_unlocked(cl); | 667 | c->journal.io_in_flight = 1; |
658 | else | 668 | closure_call(cl, journal_write_unlocked, NULL, &c->cl); |
669 | } else { | ||
659 | spin_unlock(&c->journal.lock); | 670 | spin_unlock(&c->journal.lock); |
671 | } | ||
660 | } | 672 | } |
661 | 673 | ||
662 | static struct journal_write *journal_wait_for_write(struct cache_set *c, | 674 | static struct journal_write *journal_wait_for_write(struct cache_set *c, |
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
664 | { | 676 | { |
665 | size_t sectors; | 677 | size_t sectors; |
666 | struct closure cl; | 678 | struct closure cl; |
679 | bool wait = false; | ||
667 | 680 | ||
668 | closure_init_stack(&cl); | 681 | closure_init_stack(&cl); |
669 | 682 | ||
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
673 | struct journal_write *w = c->journal.cur; | 686 | struct journal_write *w = c->journal.cur; |
674 | 687 | ||
675 | sectors = __set_blocks(w->data, w->data->keys + nkeys, | 688 | sectors = __set_blocks(w->data, w->data->keys + nkeys, |
676 | c) * c->sb.block_size; | 689 | block_bytes(c)) * c->sb.block_size; |
677 | 690 | ||
678 | if (sectors <= min_t(size_t, | 691 | if (sectors <= min_t(size_t, |
679 | c->journal.blocks_free * c->sb.block_size, | 692 | c->journal.blocks_free * c->sb.block_size, |
680 | PAGE_SECTORS << JSET_BITS)) | 693 | PAGE_SECTORS << JSET_BITS)) |
681 | return w; | 694 | return w; |
682 | 695 | ||
683 | /* XXX: tracepoint */ | 696 | if (wait) |
697 | closure_wait(&c->journal.wait, &cl); | ||
698 | |||
684 | if (!journal_full(&c->journal)) { | 699 | if (!journal_full(&c->journal)) { |
685 | trace_bcache_journal_entry_full(c); | 700 | if (wait) |
701 | trace_bcache_journal_entry_full(c); | ||
686 | 702 | ||
687 | /* | 703 | /* |
688 | * XXX: If we were inserting so many keys that they | 704 | * XXX: If we were inserting so many keys that they |
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
692 | */ | 708 | */ |
693 | BUG_ON(!w->data->keys); | 709 | BUG_ON(!w->data->keys); |
694 | 710 | ||
695 | closure_wait(&w->wait, &cl); | ||
696 | journal_try_write(c); /* unlocks */ | 711 | journal_try_write(c); /* unlocks */ |
697 | } else { | 712 | } else { |
698 | trace_bcache_journal_full(c); | 713 | if (wait) |
714 | trace_bcache_journal_full(c); | ||
699 | 715 | ||
700 | closure_wait(&c->journal.wait, &cl); | ||
701 | journal_reclaim(c); | 716 | journal_reclaim(c); |
702 | spin_unlock(&c->journal.lock); | 717 | spin_unlock(&c->journal.lock); |
703 | 718 | ||
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, | |||
706 | 721 | ||
707 | closure_sync(&cl); | 722 | closure_sync(&cl); |
708 | spin_lock(&c->journal.lock); | 723 | spin_lock(&c->journal.lock); |
724 | wait = true; | ||
709 | } | 725 | } |
710 | } | 726 | } |
711 | 727 | ||
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c, | |||
736 | 752 | ||
737 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); | 753 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); |
738 | 754 | ||
739 | memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); | 755 | memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); |
740 | w->data->keys += bch_keylist_nkeys(keys); | 756 | w->data->keys += bch_keylist_nkeys(keys); |
741 | 757 | ||
742 | ret = &fifo_back(&c->journal.pin); | 758 | ret = &fifo_back(&c->journal.pin); |
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c) | |||
780 | { | 796 | { |
781 | struct journal *j = &c->journal; | 797 | struct journal *j = &c->journal; |
782 | 798 | ||
783 | closure_init_unlocked(&j->io); | ||
784 | spin_lock_init(&j->lock); | 799 | spin_lock_init(&j->lock); |
785 | INIT_DELAYED_WORK(&j->work, journal_write_work); | 800 | INIT_DELAYED_WORK(&j->work, journal_write_work); |
786 | 801 | ||
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index a6472fda94b2..9180c4465075 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h | |||
@@ -104,6 +104,7 @@ struct journal { | |||
104 | /* used when waiting because the journal was full */ | 104 | /* used when waiting because the journal was full */ |
105 | struct closure_waitlist wait; | 105 | struct closure_waitlist wait; |
106 | struct closure io; | 106 | struct closure io; |
107 | int io_in_flight; | ||
107 | struct delayed_work work; | 108 | struct delayed_work work; |
108 | 109 | ||
109 | /* Number of blocks free in the bucket(s) we're currently writing to */ | 110 | /* Number of blocks free in the bucket(s) we're currently writing to */ |
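The new io_in_flight field replaces the old closure_trylock() scheme: journal_try_write() tests and sets the flag under the journal lock, and the write's completion destructor clears it under the same lock, so the unlock no longer races with cache-set unregister. A rough user-space model of that handshake follows, with a pthread mutex standing in for the spinlock and the closure machinery; the re-kick of pending writes from the completion path is not modelled.

/* Model of the io_in_flight serialization; pthread mutex stands in for the spinlock. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct journal {
	pthread_mutex_t lock;
	bool io_in_flight;
	int writes_started;
};

/* Called with j->lock held; drops it on both paths, like journal_try_write(). */
static void journal_try_write(struct journal *j)
{
	if (!j->io_in_flight) {
		j->io_in_flight = true;
		j->writes_started++;
		/* the real code hands off to journal_write_unlocked() here */
	}
	pthread_mutex_unlock(&j->lock);
}

/* Completion path, like journal_write_unlock(). */
static void journal_write_done(struct journal *j)
{
	pthread_mutex_lock(&j->lock);
	j->io_in_flight = false;
	pthread_mutex_unlock(&j->lock);
}

int main(void)
{
	struct journal j = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

	pthread_mutex_lock(&j.lock);
	journal_try_write(&j);		/* starts a write */

	pthread_mutex_lock(&j.lock);
	journal_try_write(&j);		/* already in flight: no-op */

	journal_write_done(&j);

	pthread_mutex_lock(&j.lock);
	journal_try_write(&j);		/* previous write finished: starts again */

	printf("writes started: %d\n", j.writes_started);	/* 2 */
	return 0;
}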
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index f2f0998c4a91..9eb60d102de8 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c | |||
@@ -86,7 +86,7 @@ static void moving_init(struct moving_io *io) | |||
86 | bio_get(bio); | 86 | bio_get(bio); |
87 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | 87 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
88 | 88 | ||
89 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; | 89 | bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; |
90 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), | 90 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), |
91 | PAGE_SECTORS); | 91 | PAGE_SECTORS); |
92 | bio->bi_private = &io->cl; | 92 | bio->bi_private = &io->cl; |
@@ -102,7 +102,7 @@ static void write_moving(struct closure *cl) | |||
102 | if (!op->error) { | 102 | if (!op->error) { |
103 | moving_init(io); | 103 | moving_init(io); |
104 | 104 | ||
105 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | 105 | io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); |
106 | op->write_prio = 1; | 106 | op->write_prio = 1; |
107 | op->bio = &io->bio.bio; | 107 | op->bio = &io->bio.bio; |
108 | 108 | ||
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c) | |||
211 | for_each_cache(ca, c, i) { | 211 | for_each_cache(ca, c, i) { |
212 | unsigned sectors_to_move = 0; | 212 | unsigned sectors_to_move = 0; |
213 | unsigned reserve_sectors = ca->sb.bucket_size * | 213 | unsigned reserve_sectors = ca->sb.bucket_size * |
214 | min(fifo_used(&ca->free), ca->free.size / 2); | 214 | fifo_used(&ca->free[RESERVE_MOVINGGC]); |
215 | 215 | ||
216 | ca->heap.used = 0; | 216 | ca->heap.used = 0; |
217 | 217 | ||
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fbcc851ed5a5..5d5d031cf381 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | |||
163 | static void bcachecg_destroy(struct cgroup *cgroup) | 163 | static void bcachecg_destroy(struct cgroup *cgroup) |
164 | { | 164 | { |
165 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); | 165 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); |
166 | free_css_id(&bcache_subsys, &cg->css); | ||
167 | kfree(cg); | 166 | kfree(cg); |
168 | } | 167 | } |
169 | 168 | ||
@@ -198,14 +197,14 @@ static bool verify(struct cached_dev *dc, struct bio *bio) | |||
198 | 197 | ||
199 | static void bio_csum(struct bio *bio, struct bkey *k) | 198 | static void bio_csum(struct bio *bio, struct bkey *k) |
200 | { | 199 | { |
201 | struct bio_vec *bv; | 200 | struct bio_vec bv; |
201 | struct bvec_iter iter; | ||
202 | uint64_t csum = 0; | 202 | uint64_t csum = 0; |
203 | int i; | ||
204 | 203 | ||
205 | bio_for_each_segment(bv, bio, i) { | 204 | bio_for_each_segment(bv, bio, iter) { |
206 | void *d = kmap(bv->bv_page) + bv->bv_offset; | 205 | void *d = kmap(bv.bv_page) + bv.bv_offset; |
207 | csum = bch_crc64_update(csum, d, bv->bv_len); | 206 | csum = bch_crc64_update(csum, d, bv.bv_len); |
208 | kunmap(bv->bv_page); | 207 | kunmap(bv.bv_page); |
209 | } | 208 | } |
210 | 209 | ||
211 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); | 210 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); |
@@ -255,26 +254,44 @@ static void bch_data_insert_keys(struct closure *cl) | |||
255 | closure_return(cl); | 254 | closure_return(cl); |
256 | } | 255 | } |
257 | 256 | ||
257 | static int bch_keylist_realloc(struct keylist *l, unsigned u64s, | ||
258 | struct cache_set *c) | ||
259 | { | ||
260 | size_t oldsize = bch_keylist_nkeys(l); | ||
261 | size_t newsize = oldsize + u64s; | ||
262 | |||
263 | /* | ||
264 | * The journalling code doesn't handle the case where the keys to insert | ||
265 | * are bigger than an empty write: If we just return -ENOMEM here, | ||
266 | * bio_insert() and bio_invalidate() will insert the keys created so far | ||
267 | * and finish the rest when the keylist is empty. | ||
268 | */ | ||
269 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | ||
270 | return -ENOMEM; | ||
271 | |||
272 | return __bch_keylist_realloc(l, u64s); | ||
273 | } | ||
274 | |||
258 | static void bch_data_invalidate(struct closure *cl) | 275 | static void bch_data_invalidate(struct closure *cl) |
259 | { | 276 | { |
260 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); | 277 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
261 | struct bio *bio = op->bio; | 278 | struct bio *bio = op->bio; |
262 | 279 | ||
263 | pr_debug("invalidating %i sectors from %llu", | 280 | pr_debug("invalidating %i sectors from %llu", |
264 | bio_sectors(bio), (uint64_t) bio->bi_sector); | 281 | bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); |
265 | 282 | ||
266 | while (bio_sectors(bio)) { | 283 | while (bio_sectors(bio)) { |
267 | unsigned sectors = min(bio_sectors(bio), | 284 | unsigned sectors = min(bio_sectors(bio), |
268 | 1U << (KEY_SIZE_BITS - 1)); | 285 | 1U << (KEY_SIZE_BITS - 1)); |
269 | 286 | ||
270 | if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) | 287 | if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) |
271 | goto out; | 288 | goto out; |
272 | 289 | ||
273 | bio->bi_sector += sectors; | 290 | bio->bi_iter.bi_sector += sectors; |
274 | bio->bi_size -= sectors << 9; | 291 | bio->bi_iter.bi_size -= sectors << 9; |
275 | 292 | ||
276 | bch_keylist_add(&op->insert_keys, | 293 | bch_keylist_add(&op->insert_keys, |
277 | &KEY(op->inode, bio->bi_sector, sectors)); | 294 | &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); |
278 | } | 295 | } |
279 | 296 | ||
280 | op->insert_data_done = true; | 297 | op->insert_data_done = true; |
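The bch_keylist_realloc() wrapper added above caps a batch of insert keys at what still fits in one journal write (a block minus the jset header), so bch_data_invalidate() and bch_data_insert_start() flush what they have and continue when it returns -ENOMEM. A sketch of just that bound, with illustrative sizes in place of block_bytes(c) and sizeof(struct jset):

/* Model of the one-journal-block bound; sizes are illustrative stand-ins. */
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_BLOCK_BYTES	4096
#define MODEL_JSET_HDR_BYTES	64	/* stand-in for sizeof(struct jset) */

static int keylist_realloc(size_t current_u64s, size_t extra_u64s)
{
	size_t newsize = current_u64s + extra_u64s;

	if (newsize * sizeof(uint64_t) > MODEL_BLOCK_BYTES - MODEL_JSET_HDR_BYTES)
		return -ENOMEM;	/* caller inserts what it has, then retries */

	return 0;
}

int main(void)
{
	printf("%d %d\n",
	       keylist_realloc(100, 3),		/* fits in one block: 0 */
	       keylist_realloc(500, 10));	/* would overflow: -ENOMEM */
	return 0;
}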
@@ -336,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl) | |||
336 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); | 353 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
337 | struct bio *bio = op->bio, *n; | 354 | struct bio *bio = op->bio, *n; |
338 | 355 | ||
339 | if (op->bypass) | ||
340 | return bch_data_invalidate(cl); | ||
341 | |||
342 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | 356 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { |
343 | set_gc_sectors(op->c); | 357 | set_gc_sectors(op->c); |
344 | wake_up_gc(op->c); | 358 | wake_up_gc(op->c); |
345 | } | 359 | } |
346 | 360 | ||
361 | if (op->bypass) | ||
362 | return bch_data_invalidate(cl); | ||
363 | |||
347 | /* | 364 | /* |
348 | * Journal writes are marked REQ_FLUSH; if the original write was a | 365 | * Journal writes are marked REQ_FLUSH; if the original write was a |
349 | * flush, it'll wait on the journal write. | 366 | * flush, it'll wait on the journal write. |
@@ -357,21 +374,21 @@ static void bch_data_insert_start(struct closure *cl) | |||
357 | 374 | ||
358 | /* 1 for the device pointer and 1 for the chksum */ | 375 | /* 1 for the device pointer and 1 for the chksum */ |
359 | if (bch_keylist_realloc(&op->insert_keys, | 376 | if (bch_keylist_realloc(&op->insert_keys, |
360 | 1 + (op->csum ? 1 : 0), | 377 | 3 + (op->csum ? 1 : 0), |
361 | op->c)) | 378 | op->c)) |
362 | continue_at(cl, bch_data_insert_keys, bcache_wq); | 379 | continue_at(cl, bch_data_insert_keys, bcache_wq); |
363 | 380 | ||
364 | k = op->insert_keys.top; | 381 | k = op->insert_keys.top; |
365 | bkey_init(k); | 382 | bkey_init(k); |
366 | SET_KEY_INODE(k, op->inode); | 383 | SET_KEY_INODE(k, op->inode); |
367 | SET_KEY_OFFSET(k, bio->bi_sector); | 384 | SET_KEY_OFFSET(k, bio->bi_iter.bi_sector); |
368 | 385 | ||
369 | if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), | 386 | if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), |
370 | op->write_point, op->write_prio, | 387 | op->write_point, op->write_prio, |
371 | op->writeback)) | 388 | op->writeback)) |
372 | goto err; | 389 | goto err; |
373 | 390 | ||
374 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | 391 | n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split); |
375 | 392 | ||
376 | n->bi_end_io = bch_data_insert_endio; | 393 | n->bi_end_io = bch_data_insert_endio; |
377 | n->bi_private = cl; | 394 | n->bi_private = cl; |
@@ -522,7 +539,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) | |||
522 | (bio->bi_rw & REQ_WRITE))) | 539 | (bio->bi_rw & REQ_WRITE))) |
523 | goto skip; | 540 | goto skip; |
524 | 541 | ||
525 | if (bio->bi_sector & (c->sb.block_size - 1) || | 542 | if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || |
526 | bio_sectors(bio) & (c->sb.block_size - 1)) { | 543 | bio_sectors(bio) & (c->sb.block_size - 1)) { |
527 | pr_debug("skipping unaligned io"); | 544 | pr_debug("skipping unaligned io"); |
528 | goto skip; | 545 | goto skip; |
@@ -546,8 +563,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) | |||
546 | 563 | ||
547 | spin_lock(&dc->io_lock); | 564 | spin_lock(&dc->io_lock); |
548 | 565 | ||
549 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) | 566 | hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) |
550 | if (i->last == bio->bi_sector && | 567 | if (i->last == bio->bi_iter.bi_sector && |
551 | time_before(jiffies, i->jiffies)) | 568 | time_before(jiffies, i->jiffies)) |
552 | goto found; | 569 | goto found; |
553 | 570 | ||
@@ -556,8 +573,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) | |||
556 | add_sequential(task); | 573 | add_sequential(task); |
557 | i->sequential = 0; | 574 | i->sequential = 0; |
558 | found: | 575 | found: |
559 | if (i->sequential + bio->bi_size > i->sequential) | 576 | if (i->sequential + bio->bi_iter.bi_size > i->sequential) |
560 | i->sequential += bio->bi_size; | 577 | i->sequential += bio->bi_iter.bi_size; |
561 | 578 | ||
562 | i->last = bio_end_sector(bio); | 579 | i->last = bio_end_sector(bio); |
563 | i->jiffies = jiffies + msecs_to_jiffies(5000); | 580 | i->jiffies = jiffies + msecs_to_jiffies(5000); |
@@ -597,16 +614,13 @@ struct search { | |||
597 | /* Stack frame for bio_complete */ | 614 | /* Stack frame for bio_complete */ |
598 | struct closure cl; | 615 | struct closure cl; |
599 | 616 | ||
600 | struct bcache_device *d; | ||
601 | |||
602 | struct bbio bio; | 617 | struct bbio bio; |
603 | struct bio *orig_bio; | 618 | struct bio *orig_bio; |
604 | struct bio *cache_miss; | 619 | struct bio *cache_miss; |
620 | struct bcache_device *d; | ||
605 | 621 | ||
606 | unsigned insert_bio_sectors; | 622 | unsigned insert_bio_sectors; |
607 | |||
608 | unsigned recoverable:1; | 623 | unsigned recoverable:1; |
609 | unsigned unaligned_bvec:1; | ||
610 | unsigned write:1; | 624 | unsigned write:1; |
611 | unsigned read_dirty_data:1; | 625 | unsigned read_dirty_data:1; |
612 | 626 | ||
@@ -631,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error) | |||
631 | 645 | ||
632 | if (error) | 646 | if (error) |
633 | s->iop.error = error; | 647 | s->iop.error = error; |
634 | else if (ptr_stale(s->iop.c, &b->key, 0)) { | 648 | else if (!KEY_DIRTY(&b->key) && |
649 | ptr_stale(s->iop.c, &b->key, 0)) { | ||
635 | atomic_long_inc(&s->iop.c->cache_read_races); | 650 | atomic_long_inc(&s->iop.c->cache_read_races); |
636 | s->iop.error = -EINTR; | 651 | s->iop.error = -EINTR; |
637 | } | 652 | } |
@@ -650,15 +665,15 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) | |||
650 | struct bkey *bio_key; | 665 | struct bkey *bio_key; |
651 | unsigned ptr; | 666 | unsigned ptr; |
652 | 667 | ||
653 | if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) | 668 | if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) |
654 | return MAP_CONTINUE; | 669 | return MAP_CONTINUE; |
655 | 670 | ||
656 | if (KEY_INODE(k) != s->iop.inode || | 671 | if (KEY_INODE(k) != s->iop.inode || |
657 | KEY_START(k) > bio->bi_sector) { | 672 | KEY_START(k) > bio->bi_iter.bi_sector) { |
658 | unsigned bio_sectors = bio_sectors(bio); | 673 | unsigned bio_sectors = bio_sectors(bio); |
659 | unsigned sectors = KEY_INODE(k) == s->iop.inode | 674 | unsigned sectors = KEY_INODE(k) == s->iop.inode |
660 | ? min_t(uint64_t, INT_MAX, | 675 | ? min_t(uint64_t, INT_MAX, |
661 | KEY_START(k) - bio->bi_sector) | 676 | KEY_START(k) - bio->bi_iter.bi_sector) |
662 | : INT_MAX; | 677 | : INT_MAX; |
663 | 678 | ||
664 | int ret = s->d->cache_miss(b, s, bio, sectors); | 679 | int ret = s->d->cache_miss(b, s, bio, sectors); |
@@ -680,14 +695,14 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) | |||
680 | if (KEY_DIRTY(k)) | 695 | if (KEY_DIRTY(k)) |
681 | s->read_dirty_data = true; | 696 | s->read_dirty_data = true; |
682 | 697 | ||
683 | n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, | 698 | n = bio_next_split(bio, min_t(uint64_t, INT_MAX, |
684 | KEY_OFFSET(k) - bio->bi_sector), | 699 | KEY_OFFSET(k) - bio->bi_iter.bi_sector), |
685 | GFP_NOIO, s->d->bio_split); | 700 | GFP_NOIO, s->d->bio_split); |
686 | 701 | ||
687 | bio_key = &container_of(n, struct bbio, bio)->key; | 702 | bio_key = &container_of(n, struct bbio, bio)->key; |
688 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | 703 | bch_bkey_copy_single_ptr(bio_key, k, ptr); |
689 | 704 | ||
690 | bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); | 705 | bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); |
691 | bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); | 706 | bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); |
692 | 707 | ||
693 | n->bi_end_io = bch_cache_read_endio; | 708 | n->bi_end_io = bch_cache_read_endio; |
@@ -712,10 +727,13 @@ static void cache_lookup(struct closure *cl) | |||
712 | { | 727 | { |
713 | struct search *s = container_of(cl, struct search, iop.cl); | 728 | struct search *s = container_of(cl, struct search, iop.cl); |
714 | struct bio *bio = &s->bio.bio; | 729 | struct bio *bio = &s->bio.bio; |
730 | int ret; | ||
731 | |||
732 | bch_btree_op_init(&s->op, -1); | ||
715 | 733 | ||
716 | int ret = bch_btree_map_keys(&s->op, s->iop.c, | 734 | ret = bch_btree_map_keys(&s->op, s->iop.c, |
717 | &KEY(s->iop.inode, bio->bi_sector, 0), | 735 | &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), |
718 | cache_lookup_fn, MAP_END_KEY); | 736 | cache_lookup_fn, MAP_END_KEY); |
719 | if (ret == -EAGAIN) | 737 | if (ret == -EAGAIN) |
720 | continue_at(cl, cache_lookup, bcache_wq); | 738 | continue_at(cl, cache_lookup, bcache_wq); |
721 | 739 | ||
@@ -756,13 +774,15 @@ static void bio_complete(struct search *s) | |||
756 | } | 774 | } |
757 | } | 775 | } |
758 | 776 | ||
759 | static void do_bio_hook(struct search *s) | 777 | static void do_bio_hook(struct search *s, struct bio *orig_bio) |
760 | { | 778 | { |
761 | struct bio *bio = &s->bio.bio; | 779 | struct bio *bio = &s->bio.bio; |
762 | memcpy(bio, s->orig_bio, sizeof(struct bio)); | ||
763 | 780 | ||
781 | bio_init(bio); | ||
782 | __bio_clone_fast(bio, orig_bio); | ||
764 | bio->bi_end_io = request_endio; | 783 | bio->bi_end_io = request_endio; |
765 | bio->bi_private = &s->cl; | 784 | bio->bi_private = &s->cl; |
785 | |||
766 | atomic_set(&bio->bi_cnt, 3); | 786 | atomic_set(&bio->bi_cnt, 3); |
767 | } | 787 | } |
768 | 788 | ||
@@ -774,43 +794,36 @@ static void search_free(struct closure *cl) | |||
774 | if (s->iop.bio) | 794 | if (s->iop.bio) |
775 | bio_put(s->iop.bio); | 795 | bio_put(s->iop.bio); |
776 | 796 | ||
777 | if (s->unaligned_bvec) | ||
778 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); | ||
779 | |||
780 | closure_debug_destroy(cl); | 797 | closure_debug_destroy(cl); |
781 | mempool_free(s, s->d->c->search); | 798 | mempool_free(s, s->d->c->search); |
782 | } | 799 | } |
783 | 800 | ||
784 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | 801 | static inline struct search *search_alloc(struct bio *bio, |
802 | struct bcache_device *d) | ||
785 | { | 803 | { |
786 | struct search *s; | 804 | struct search *s; |
787 | struct bio_vec *bv; | ||
788 | 805 | ||
789 | s = mempool_alloc(d->c->search, GFP_NOIO); | 806 | s = mempool_alloc(d->c->search, GFP_NOIO); |
790 | memset(s, 0, offsetof(struct search, iop.insert_keys)); | ||
791 | 807 | ||
792 | __closure_init(&s->cl, NULL); | 808 | closure_init(&s->cl, NULL); |
809 | do_bio_hook(s, bio); | ||
793 | 810 | ||
794 | s->iop.inode = d->id; | ||
795 | s->iop.c = d->c; | ||
796 | s->d = d; | ||
797 | s->op.lock = -1; | ||
798 | s->iop.write_point = hash_long((unsigned long) current, 16); | ||
799 | s->orig_bio = bio; | 811 | s->orig_bio = bio; |
800 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 812 | s->cache_miss = NULL; |
801 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | 813 | s->d = d; |
802 | s->recoverable = 1; | 814 | s->recoverable = 1; |
815 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | ||
816 | s->read_dirty_data = 0; | ||
803 | s->start_time = jiffies; | 817 | s->start_time = jiffies; |
804 | do_bio_hook(s); | ||
805 | 818 | ||
806 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { | 819 | s->iop.c = d->c; |
807 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); | 820 | s->iop.bio = NULL; |
808 | memcpy(bv, bio_iovec(bio), | 821 | s->iop.inode = d->id; |
809 | sizeof(struct bio_vec) * bio_segments(bio)); | 822 | s->iop.write_point = hash_long((unsigned long) current, 16); |
810 | 823 | s->iop.write_prio = 0; | |
811 | s->bio.bio.bi_io_vec = bv; | 824 | s->iop.error = 0; |
812 | s->unaligned_bvec = 1; | 825 | s->iop.flags = 0; |
813 | } | 826 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; |
814 | 827 | ||
815 | return s; | 828 | return s; |
816 | } | 829 | } |
@@ -850,26 +863,13 @@ static void cached_dev_read_error(struct closure *cl) | |||
850 | { | 863 | { |
851 | struct search *s = container_of(cl, struct search, cl); | 864 | struct search *s = container_of(cl, struct search, cl); |
852 | struct bio *bio = &s->bio.bio; | 865 | struct bio *bio = &s->bio.bio; |
853 | struct bio_vec *bv; | ||
854 | int i; | ||
855 | 866 | ||
856 | if (s->recoverable) { | 867 | if (s->recoverable) { |
857 | /* Retry from the backing device: */ | 868 | /* Retry from the backing device: */ |
858 | trace_bcache_read_retry(s->orig_bio); | 869 | trace_bcache_read_retry(s->orig_bio); |
859 | 870 | ||
860 | s->iop.error = 0; | 871 | s->iop.error = 0; |
861 | bv = s->bio.bio.bi_io_vec; | 872 | do_bio_hook(s, s->orig_bio); |
862 | do_bio_hook(s); | ||
863 | s->bio.bio.bi_io_vec = bv; | ||
864 | |||
865 | if (!s->unaligned_bvec) | ||
866 | bio_for_each_segment(bv, s->orig_bio, i) | ||
867 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | ||
868 | else | ||
869 | memcpy(s->bio.bio.bi_io_vec, | ||
870 | bio_iovec(s->orig_bio), | ||
871 | sizeof(struct bio_vec) * | ||
872 | bio_segments(s->orig_bio)); | ||
873 | 873 | ||
874 | /* XXX: invalidate cache */ | 874 | /* XXX: invalidate cache */ |
875 | 875 | ||
@@ -894,9 +894,9 @@ static void cached_dev_read_done(struct closure *cl) | |||
894 | 894 | ||
895 | if (s->iop.bio) { | 895 | if (s->iop.bio) { |
896 | bio_reset(s->iop.bio); | 896 | bio_reset(s->iop.bio); |
897 | s->iop.bio->bi_sector = s->cache_miss->bi_sector; | 897 | s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; |
898 | s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; | 898 | s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; |
899 | s->iop.bio->bi_size = s->insert_bio_sectors << 9; | 899 | s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; |
900 | bch_bio_map(s->iop.bio, NULL); | 900 | bch_bio_map(s->iop.bio, NULL); |
901 | 901 | ||
902 | bio_copy_data(s->cache_miss, s->iop.bio); | 902 | bio_copy_data(s->cache_miss, s->iop.bio); |
@@ -905,8 +905,7 @@ static void cached_dev_read_done(struct closure *cl) | |||
905 | s->cache_miss = NULL; | 905 | s->cache_miss = NULL; |
906 | } | 906 | } |
907 | 907 | ||
908 | if (verify(dc, &s->bio.bio) && s->recoverable && | 908 | if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) |
909 | !s->unaligned_bvec && !s->read_dirty_data) | ||
910 | bch_data_verify(dc, s->orig_bio); | 909 | bch_data_verify(dc, s->orig_bio); |
911 | 910 | ||
912 | bio_complete(s); | 911 | bio_complete(s); |
@@ -946,7 +945,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
946 | struct bio *miss, *cache_bio; | 945 | struct bio *miss, *cache_bio; |
947 | 946 | ||
948 | if (s->cache_miss || s->iop.bypass) { | 947 | if (s->cache_miss || s->iop.bypass) { |
949 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 948 | miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
950 | ret = miss == bio ? MAP_DONE : MAP_CONTINUE; | 949 | ret = miss == bio ? MAP_DONE : MAP_CONTINUE; |
951 | goto out_submit; | 950 | goto out_submit; |
952 | } | 951 | } |
@@ -960,7 +959,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
960 | s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); | 959 | s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); |
961 | 960 | ||
962 | s->iop.replace_key = KEY(s->iop.inode, | 961 | s->iop.replace_key = KEY(s->iop.inode, |
963 | bio->bi_sector + s->insert_bio_sectors, | 962 | bio->bi_iter.bi_sector + s->insert_bio_sectors, |
964 | s->insert_bio_sectors); | 963 | s->insert_bio_sectors); |
965 | 964 | ||
966 | ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); | 965 | ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); |
@@ -969,7 +968,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
969 | 968 | ||
970 | s->iop.replace = true; | 969 | s->iop.replace = true; |
971 | 970 | ||
972 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 971 | miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
973 | 972 | ||
974 | /* btree_search_recurse()'s btree iterator is no good anymore */ | 973 | /* btree_search_recurse()'s btree iterator is no good anymore */ |
975 | ret = miss == bio ? MAP_DONE : -EINTR; | 974 | ret = miss == bio ? MAP_DONE : -EINTR; |
@@ -980,9 +979,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
980 | if (!cache_bio) | 979 | if (!cache_bio) |
981 | goto out_submit; | 980 | goto out_submit; |
982 | 981 | ||
983 | cache_bio->bi_sector = miss->bi_sector; | 982 | cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; |
984 | cache_bio->bi_bdev = miss->bi_bdev; | 983 | cache_bio->bi_bdev = miss->bi_bdev; |
985 | cache_bio->bi_size = s->insert_bio_sectors << 9; | 984 | cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; |
986 | 985 | ||
987 | cache_bio->bi_end_io = request_endio; | 986 | cache_bio->bi_end_io = request_endio; |
988 | cache_bio->bi_private = &s->cl; | 987 | cache_bio->bi_private = &s->cl; |
@@ -1032,7 +1031,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) | |||
1032 | { | 1031 | { |
1033 | struct closure *cl = &s->cl; | 1032 | struct closure *cl = &s->cl; |
1034 | struct bio *bio = &s->bio.bio; | 1033 | struct bio *bio = &s->bio.bio; |
1035 | struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); | 1034 | struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0); |
1036 | struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); | 1035 | struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); |
1037 | 1036 | ||
1038 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); | 1037 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); |
@@ -1088,8 +1087,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) | |||
1088 | closure_bio_submit(flush, cl, s->d); | 1087 | closure_bio_submit(flush, cl, s->d); |
1089 | } | 1088 | } |
1090 | } else { | 1089 | } else { |
1091 | s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, | 1090 | s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); |
1092 | dc->disk.bio_split); | ||
1093 | 1091 | ||
1094 | closure_bio_submit(bio, cl, s->d); | 1092 | closure_bio_submit(bio, cl, s->d); |
1095 | } | 1093 | } |
@@ -1127,13 +1125,13 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | |||
1127 | part_stat_unlock(); | 1125 | part_stat_unlock(); |
1128 | 1126 | ||
1129 | bio->bi_bdev = dc->bdev; | 1127 | bio->bi_bdev = dc->bdev; |
1130 | bio->bi_sector += dc->sb.data_offset; | 1128 | bio->bi_iter.bi_sector += dc->sb.data_offset; |
1131 | 1129 | ||
1132 | if (cached_dev_get(dc)) { | 1130 | if (cached_dev_get(dc)) { |
1133 | s = search_alloc(bio, d); | 1131 | s = search_alloc(bio, d); |
1134 | trace_bcache_request_start(s->d, bio); | 1132 | trace_bcache_request_start(s->d, bio); |
1135 | 1133 | ||
1136 | if (!bio->bi_size) { | 1134 | if (!bio->bi_iter.bi_size) { |
1137 | /* | 1135 | /* |
1138 | * can't call bch_journal_meta from under | 1136 | * can't call bch_journal_meta from under |
1139 | * generic_make_request | 1137 | * generic_make_request |
@@ -1205,24 +1203,24 @@ void bch_cached_dev_request_init(struct cached_dev *dc) | |||
1205 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | 1203 | static int flash_dev_cache_miss(struct btree *b, struct search *s, |
1206 | struct bio *bio, unsigned sectors) | 1204 | struct bio *bio, unsigned sectors) |
1207 | { | 1205 | { |
1208 | struct bio_vec *bv; | 1206 | struct bio_vec bv; |
1209 | int i; | 1207 | struct bvec_iter iter; |
1210 | 1208 | ||
1211 | /* Zero fill bio */ | 1209 | /* Zero fill bio */ |
1212 | 1210 | ||
1213 | bio_for_each_segment(bv, bio, i) { | 1211 | bio_for_each_segment(bv, bio, iter) { |
1214 | unsigned j = min(bv->bv_len >> 9, sectors); | 1212 | unsigned j = min(bv.bv_len >> 9, sectors); |
1215 | 1213 | ||
1216 | void *p = kmap(bv->bv_page); | 1214 | void *p = kmap(bv.bv_page); |
1217 | memset(p + bv->bv_offset, 0, j << 9); | 1215 | memset(p + bv.bv_offset, 0, j << 9); |
1218 | kunmap(bv->bv_page); | 1216 | kunmap(bv.bv_page); |
1219 | 1217 | ||
1220 | sectors -= j; | 1218 | sectors -= j; |
1221 | } | 1219 | } |
1222 | 1220 | ||
1223 | bio_advance(bio, min(sectors << 9, bio->bi_size)); | 1221 | bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); |
1224 | 1222 | ||
1225 | if (!bio->bi_size) | 1223 | if (!bio->bi_iter.bi_size) |
1226 | return MAP_DONE; | 1224 | return MAP_DONE; |
1227 | 1225 | ||
1228 | return MAP_CONTINUE; | 1226 | return MAP_CONTINUE; |
@@ -1256,7 +1254,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | |||
1256 | 1254 | ||
1257 | trace_bcache_request_start(s->d, bio); | 1255 | trace_bcache_request_start(s->d, bio); |
1258 | 1256 | ||
1259 | if (!bio->bi_size) { | 1257 | if (!bio->bi_iter.bi_size) { |
1260 | /* | 1258 | /* |
1261 | * can't call bch_journal_meta from under | 1259 | * can't call bch_journal_meta from under |
1262 | * generic_make_request | 1260 | * generic_make_request |
@@ -1266,7 +1264,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | |||
1266 | bcache_wq); | 1264 | bcache_wq); |
1267 | } else if (rw) { | 1265 | } else if (rw) { |
1268 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, | 1266 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, |
1269 | &KEY(d->id, bio->bi_sector, 0), | 1267 | &KEY(d->id, bio->bi_iter.bi_sector, 0), |
1270 | &KEY(d->id, bio_end_sector(bio), 0)); | 1268 | &KEY(d->id, bio_end_sector(bio), 0)); |
1271 | 1269 | ||
1272 | s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; | 1270 | s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; |
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 2cd65bf073c2..39f21dbedc38 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h | |||
@@ -13,17 +13,22 @@ struct data_insert_op { | |||
13 | uint16_t write_prio; | 13 | uint16_t write_prio; |
14 | short error; | 14 | short error; |
15 | 15 | ||
16 | unsigned bypass:1; | 16 | union { |
17 | unsigned writeback:1; | 17 | uint16_t flags; |
18 | unsigned flush_journal:1; | ||
19 | unsigned csum:1; | ||
20 | 18 | ||
21 | unsigned replace:1; | 19 | struct { |
22 | unsigned replace_collision:1; | 20 | unsigned bypass:1; |
21 | unsigned writeback:1; | ||
22 | unsigned flush_journal:1; | ||
23 | unsigned csum:1; | ||
23 | 24 | ||
24 | unsigned insert_data_done:1; | 25 | unsigned replace:1; |
26 | unsigned replace_collision:1; | ||
27 | |||
28 | unsigned insert_data_done:1; | ||
29 | }; | ||
30 | }; | ||
25 | 31 | ||
26 | /* Anything past this point won't get zeroed in search_alloc() */ | ||
27 | struct keylist insert_keys; | 32 | struct keylist insert_keys; |
28 | BKEY_PADDED(replace_key); | 33 | BKEY_PADDED(replace_key); |
29 | }; | 34 | }; |
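Wrapping the data_insert_op bitfields in an anonymous union with a uint16_t lets search_alloc(), which no longer memset()s the whole op, reset every flag with a single s->iop.flags = 0. The cut-down stand-in below demonstrates the idea; the real struct carries more fields around the union.

/* Stand-in showing the flags-union trick; not the real data_insert_op. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct insert_flags {
	union {
		uint16_t flags;
		struct {
			unsigned bypass:1;
			unsigned writeback:1;
			unsigned flush_journal:1;
			unsigned csum:1;
			unsigned replace:1;
			unsigned replace_collision:1;
			unsigned insert_data_done:1;
		};
	};
};

int main(void)
{
	struct insert_flags op;

	op.bypass = 1;
	op.replace = 1;
	op.insert_data_done = 1;

	op.flags = 0;		/* clears all of the bitfields in one store */

	assert(!op.bypass && !op.replace && !op.insert_data_done);
	printf("flags = %u\n", op.flags);
	return 0;
}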
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c57bfa071a57..24a3a1546caa 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "extents.h" | ||
12 | #include "request.h" | 13 | #include "request.h" |
13 | #include "writeback.h" | 14 | #include "writeback.h" |
14 | 15 | ||
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error) | |||
225 | struct cached_dev *dc = bio->bi_private; | 226 | struct cached_dev *dc = bio->bi_private; |
226 | /* XXX: error checking */ | 227 | /* XXX: error checking */ |
227 | 228 | ||
228 | closure_put(&dc->sb_write.cl); | 229 | closure_put(&dc->sb_write); |
229 | } | 230 | } |
230 | 231 | ||
231 | static void __write_super(struct cache_sb *sb, struct bio *bio) | 232 | static void __write_super(struct cache_sb *sb, struct bio *bio) |
@@ -233,9 +234,9 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) | |||
233 | struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); | 234 | struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); |
234 | unsigned i; | 235 | unsigned i; |
235 | 236 | ||
236 | bio->bi_sector = SB_SECTOR; | 237 | bio->bi_iter.bi_sector = SB_SECTOR; |
237 | bio->bi_rw = REQ_SYNC|REQ_META; | 238 | bio->bi_rw = REQ_SYNC|REQ_META; |
238 | bio->bi_size = SB_SIZE; | 239 | bio->bi_iter.bi_size = SB_SIZE; |
239 | bch_bio_map(bio, NULL); | 240 | bch_bio_map(bio, NULL); |
240 | 241 | ||
241 | out->offset = cpu_to_le64(sb->offset); | 242 | out->offset = cpu_to_le64(sb->offset); |
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) | |||
263 | submit_bio(REQ_WRITE, bio); | 264 | submit_bio(REQ_WRITE, bio); |
264 | } | 265 | } |
265 | 266 | ||
267 | static void bch_write_bdev_super_unlock(struct closure *cl) | ||
268 | { | ||
269 | struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); | ||
270 | |||
271 | up(&dc->sb_write_mutex); | ||
272 | } | ||
273 | |||
266 | void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | 274 | void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) |
267 | { | 275 | { |
268 | struct closure *cl = &dc->sb_write.cl; | 276 | struct closure *cl = &dc->sb_write; |
269 | struct bio *bio = &dc->sb_bio; | 277 | struct bio *bio = &dc->sb_bio; |
270 | 278 | ||
271 | closure_lock(&dc->sb_write, parent); | 279 | down(&dc->sb_write_mutex); |
280 | closure_init(cl, parent); | ||
272 | 281 | ||
273 | bio_reset(bio); | 282 | bio_reset(bio); |
274 | bio->bi_bdev = dc->bdev; | 283 | bio->bi_bdev = dc->bdev; |
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | |||
278 | closure_get(cl); | 287 | closure_get(cl); |
279 | __write_super(&dc->sb, bio); | 288 | __write_super(&dc->sb, bio); |
280 | 289 | ||
281 | closure_return(cl); | 290 | closure_return_with_destructor(cl, bch_write_bdev_super_unlock); |
282 | } | 291 | } |
283 | 292 | ||
284 | static void write_super_endio(struct bio *bio, int error) | 293 | static void write_super_endio(struct bio *bio, int error) |
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error) | |||
286 | struct cache *ca = bio->bi_private; | 295 | struct cache *ca = bio->bi_private; |
287 | 296 | ||
288 | bch_count_io_errors(ca, error, "writing superblock"); | 297 | bch_count_io_errors(ca, error, "writing superblock"); |
289 | closure_put(&ca->set->sb_write.cl); | 298 | closure_put(&ca->set->sb_write); |
299 | } | ||
300 | |||
301 | static void bcache_write_super_unlock(struct closure *cl) | ||
302 | { | ||
303 | struct cache_set *c = container_of(cl, struct cache_set, sb_write); | ||
304 | |||
305 | up(&c->sb_write_mutex); | ||
290 | } | 306 | } |
291 | 307 | ||
292 | void bcache_write_super(struct cache_set *c) | 308 | void bcache_write_super(struct cache_set *c) |
293 | { | 309 | { |
294 | struct closure *cl = &c->sb_write.cl; | 310 | struct closure *cl = &c->sb_write; |
295 | struct cache *ca; | 311 | struct cache *ca; |
296 | unsigned i; | 312 | unsigned i; |
297 | 313 | ||
298 | closure_lock(&c->sb_write, &c->cl); | 314 | down(&c->sb_write_mutex); |
315 | closure_init(cl, &c->cl); | ||
299 | 316 | ||
300 | c->sb.seq++; | 317 | c->sb.seq++; |
301 | 318 | ||
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c) | |||
317 | __write_super(&ca->sb, bio); | 334 | __write_super(&ca->sb, bio); |
318 | } | 335 | } |
319 | 336 | ||
320 | closure_return(cl); | 337 | closure_return_with_destructor(cl, bcache_write_super_unlock); |
321 | } | 338 | } |
322 | 339 | ||
323 | /* UUID io */ | 340 | /* UUID io */ |
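The sb_write and cache-set superblock conversions above swap the old closure_lock() for an ordinary semaphore taken before the asynchronous write and released by a closure destructor once it completes. Below is a loose user-space analogy, with a POSIX semaphore and a detached thread standing in for the closure and the bio completion; it is not kernel code.

/* Analogy: semaphore held across an async write, released by the completion. */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

static sem_t sb_write_mutex;

static void write_done(void)		/* like bch_write_bdev_super_unlock() */
{
	sem_post(&sb_write_mutex);
}

static void *async_write(void *arg)
{
	(void)arg;
	usleep(1000);			/* pretend I/O */
	puts("superblock written");
	write_done();
	return NULL;
}

static void write_super(void)		/* like bch_write_bdev_super() */
{
	pthread_t t;

	sem_wait(&sb_write_mutex);	/* was closure_lock() */
	pthread_create(&t, NULL, async_write, NULL);
	pthread_detach(t);
}

int main(void)
{
	sem_init(&sb_write_mutex, 0, 1);
	write_super();
	write_super();			/* second writer waits for the first */
	sem_wait(&sb_write_mutex);	/* drain before exiting */
	return 0;
}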
@@ -325,29 +342,37 @@ void bcache_write_super(struct cache_set *c) | |||
325 | static void uuid_endio(struct bio *bio, int error) | 342 | static void uuid_endio(struct bio *bio, int error) |
326 | { | 343 | { |
327 | struct closure *cl = bio->bi_private; | 344 | struct closure *cl = bio->bi_private; |
328 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); | 345 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write); |
329 | 346 | ||
330 | cache_set_err_on(error, c, "accessing uuids"); | 347 | cache_set_err_on(error, c, "accessing uuids"); |
331 | bch_bbio_free(bio, c); | 348 | bch_bbio_free(bio, c); |
332 | closure_put(cl); | 349 | closure_put(cl); |
333 | } | 350 | } |
334 | 351 | ||
352 | static void uuid_io_unlock(struct closure *cl) | ||
353 | { | ||
354 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write); | ||
355 | |||
356 | up(&c->uuid_write_mutex); | ||
357 | } | ||
358 | |||
335 | static void uuid_io(struct cache_set *c, unsigned long rw, | 359 | static void uuid_io(struct cache_set *c, unsigned long rw, |
336 | struct bkey *k, struct closure *parent) | 360 | struct bkey *k, struct closure *parent) |
337 | { | 361 | { |
338 | struct closure *cl = &c->uuid_write.cl; | 362 | struct closure *cl = &c->uuid_write; |
339 | struct uuid_entry *u; | 363 | struct uuid_entry *u; |
340 | unsigned i; | 364 | unsigned i; |
341 | char buf[80]; | 365 | char buf[80]; |
342 | 366 | ||
343 | BUG_ON(!parent); | 367 | BUG_ON(!parent); |
344 | closure_lock(&c->uuid_write, parent); | 368 | down(&c->uuid_write_mutex); |
369 | closure_init(cl, parent); | ||
345 | 370 | ||
346 | for (i = 0; i < KEY_PTRS(k); i++) { | 371 | for (i = 0; i < KEY_PTRS(k); i++) { |
347 | struct bio *bio = bch_bbio_alloc(c); | 372 | struct bio *bio = bch_bbio_alloc(c); |
348 | 373 | ||
349 | bio->bi_rw = REQ_SYNC|REQ_META|rw; | 374 | bio->bi_rw = REQ_SYNC|REQ_META|rw; |
350 | bio->bi_size = KEY_SIZE(k) << 9; | 375 | bio->bi_iter.bi_size = KEY_SIZE(k) << 9; |
351 | 376 | ||
352 | bio->bi_end_io = uuid_endio; | 377 | bio->bi_end_io = uuid_endio; |
353 | bio->bi_private = cl; | 378 | bio->bi_private = cl; |
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
359 | break; | 384 | break; |
360 | } | 385 | } |
361 | 386 | ||
362 | bch_bkey_to_text(buf, sizeof(buf), k); | 387 | bch_extent_to_text(buf, sizeof(buf), k); |
363 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); | 388 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); |
364 | 389 | ||
365 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) | 390 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) |
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
368 | u - c->uuids, u->uuid, u->label, | 393 | u - c->uuids, u->uuid, u->label, |
369 | u->first_reg, u->last_reg, u->invalidated); | 394 | u->first_reg, u->last_reg, u->invalidated); |
370 | 395 | ||
371 | closure_return(cl); | 396 | closure_return_with_destructor(cl, uuid_io_unlock); |
372 | } | 397 | } |
373 | 398 | ||
374 | static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) | 399 | static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) |
375 | { | 400 | { |
376 | struct bkey *k = &j->uuid_bucket; | 401 | struct bkey *k = &j->uuid_bucket; |
377 | 402 | ||
378 | if (bch_btree_ptr_invalid(c, k)) | 403 | if (__bch_btree_ptr_invalid(c, k)) |
379 | return "bad uuid pointer"; | 404 | return "bad uuid pointer"; |
380 | 405 | ||
381 | bkey_copy(&c->uuid_bucket, k); | 406 | bkey_copy(&c->uuid_bucket, k); |
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c) | |||
420 | 445 | ||
421 | lockdep_assert_held(&bch_register_lock); | 446 | lockdep_assert_held(&bch_register_lock); |
422 | 447 | ||
423 | if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) | 448 | if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) |
424 | return 1; | 449 | return 1; |
425 | 450 | ||
426 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); | 451 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); |
@@ -503,10 +528,10 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) | |||
503 | 528 | ||
504 | closure_init_stack(cl); | 529 | closure_init_stack(cl); |
505 | 530 | ||
506 | bio->bi_sector = bucket * ca->sb.bucket_size; | 531 | bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; |
507 | bio->bi_bdev = ca->bdev; | 532 | bio->bi_bdev = ca->bdev; |
508 | bio->bi_rw = REQ_SYNC|REQ_META|rw; | 533 | bio->bi_rw = REQ_SYNC|REQ_META|rw; |
509 | bio->bi_size = bucket_bytes(ca); | 534 | bio->bi_iter.bi_size = bucket_bytes(ca); |
510 | 535 | ||
511 | bio->bi_end_io = prio_endio; | 536 | bio->bi_end_io = prio_endio; |
512 | bio->bi_private = ca; | 537 | bio->bi_private = ca; |
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca) | |||
538 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), | 563 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), |
539 | &ca->meta_sectors_written); | 564 | &ca->meta_sectors_written); |
540 | 565 | ||
541 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), | 566 | //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), |
542 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | 567 | // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); |
543 | 568 | ||
544 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { | 569 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { |
545 | long bucket; | 570 | long bucket; |
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca) | |||
558 | p->magic = pset_magic(&ca->sb); | 583 | p->magic = pset_magic(&ca->sb); |
559 | p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); | 584 | p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); |
560 | 585 | ||
561 | bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); | 586 | bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); |
562 | BUG_ON(bucket == -1); | 587 | BUG_ON(bucket == -1); |
563 | 588 | ||
564 | mutex_unlock(&ca->set->bucket_lock); | 589 | mutex_unlock(&ca->set->bucket_lock); |
@@ -739,8 +764,6 @@ static void bcache_device_free(struct bcache_device *d) | |||
739 | } | 764 | } |
740 | 765 | ||
741 | bio_split_pool_free(&d->bio_split_hook); | 766 | bio_split_pool_free(&d->bio_split_hook); |
742 | if (d->unaligned_bvec) | ||
743 | mempool_destroy(d->unaligned_bvec); | ||
744 | if (d->bio_split) | 767 | if (d->bio_split) |
745 | bioset_free(d->bio_split); | 768 | bioset_free(d->bio_split); |
746 | if (is_vmalloc_addr(d->full_dirty_stripes)) | 769 | if (is_vmalloc_addr(d->full_dirty_stripes)) |
@@ -793,8 +816,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
793 | return minor; | 816 | return minor; |
794 | 817 | ||
795 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 818 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
796 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, | ||
797 | sizeof(struct bio_vec) * BIO_MAX_PAGES)) || | ||
798 | bio_split_pool_init(&d->bio_split_hook) || | 819 | bio_split_pool_init(&d->bio_split_hook) || |
799 | !(d->disk = alloc_disk(1))) { | 820 | !(d->disk = alloc_disk(1))) { |
800 | ida_simple_remove(&bcache_minor, minor); | 821 | ida_simple_remove(&bcache_minor, minor); |
@@ -1102,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1102 | set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); | 1123 | set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); |
1103 | kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); | 1124 | kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); |
1104 | INIT_WORK(&dc->detach, cached_dev_detach_finish); | 1125 | INIT_WORK(&dc->detach, cached_dev_detach_finish); |
1105 | closure_init_unlocked(&dc->sb_write); | 1126 | sema_init(&dc->sb_write_mutex, 1); |
1106 | INIT_LIST_HEAD(&dc->io_lru); | 1127 | INIT_LIST_HEAD(&dc->io_lru); |
1107 | spin_lock_init(&dc->io_lock); | 1128 | spin_lock_init(&dc->io_lock); |
1108 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); | 1129 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); |
@@ -1114,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1114 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); | 1135 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); |
1115 | } | 1136 | } |
1116 | 1137 | ||
1138 | dc->disk.stripe_size = q->limits.io_opt >> 9; | ||
1139 | |||
1140 | if (dc->disk.stripe_size) | ||
1141 | dc->partial_stripes_expensive = | ||
1142 | q->limits.raid_partial_stripes_expensive; | ||
1143 | |||
1117 | ret = bcache_device_init(&dc->disk, block_size, | 1144 | ret = bcache_device_init(&dc->disk, block_size, |
1118 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); | 1145 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); |
1119 | if (ret) | 1146 | if (ret) |
@@ -1325,8 +1352,8 @@ static void cache_set_free(struct closure *cl) | |||
1325 | if (ca) | 1352 | if (ca) |
1326 | kobject_put(&ca->kobj); | 1353 | kobject_put(&ca->kobj); |
1327 | 1354 | ||
1355 | bch_bset_sort_state_free(&c->sort); | ||
1328 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | 1356 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); |
1329 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); | ||
1330 | 1357 | ||
1331 | if (c->bio_split) | 1358 | if (c->bio_split) |
1332 | bioset_free(c->bio_split); | 1359 | bioset_free(c->bio_split); |
@@ -1451,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1451 | c->block_bits = ilog2(sb->block_size); | 1478 | c->block_bits = ilog2(sb->block_size); |
1452 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); | 1479 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); |
1453 | 1480 | ||
1454 | c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; | 1481 | c->btree_pages = bucket_pages(c); |
1455 | if (c->btree_pages > BTREE_MAX_PAGES) | 1482 | if (c->btree_pages > BTREE_MAX_PAGES) |
1456 | c->btree_pages = max_t(int, c->btree_pages / 4, | 1483 | c->btree_pages = max_t(int, c->btree_pages / 4, |
1457 | BTREE_MAX_PAGES); | 1484 | BTREE_MAX_PAGES); |
1458 | 1485 | ||
1459 | c->sort_crit_factor = int_sqrt(c->btree_pages); | 1486 | sema_init(&c->sb_write_mutex, 1); |
1460 | |||
1461 | closure_init_unlocked(&c->sb_write); | ||
1462 | mutex_init(&c->bucket_lock); | 1487 | mutex_init(&c->bucket_lock); |
1463 | init_waitqueue_head(&c->try_wait); | 1488 | init_waitqueue_head(&c->try_wait); |
1464 | init_waitqueue_head(&c->bucket_wait); | 1489 | init_waitqueue_head(&c->bucket_wait); |
1465 | closure_init_unlocked(&c->uuid_write); | 1490 | sema_init(&c->uuid_write_mutex, 1); |
1466 | mutex_init(&c->sort_lock); | ||
1467 | 1491 | ||
1468 | spin_lock_init(&c->sort_time.lock); | ||
1469 | spin_lock_init(&c->btree_gc_time.lock); | 1492 | spin_lock_init(&c->btree_gc_time.lock); |
1470 | spin_lock_init(&c->btree_split_time.lock); | 1493 | spin_lock_init(&c->btree_split_time.lock); |
1471 | spin_lock_init(&c->btree_read_time.lock); | 1494 | spin_lock_init(&c->btree_read_time.lock); |
@@ -1493,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1493 | bucket_pages(c))) || | 1516 | bucket_pages(c))) || |
1494 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || | 1517 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || |
1495 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 1518 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
1496 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || | ||
1497 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | 1519 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || |
1498 | bch_journal_alloc(c) || | 1520 | bch_journal_alloc(c) || |
1499 | bch_btree_cache_alloc(c) || | 1521 | bch_btree_cache_alloc(c) || |
1500 | bch_open_buckets_alloc(c)) | 1522 | bch_open_buckets_alloc(c) || |
1523 | bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) | ||
1501 | goto err; | 1524 | goto err; |
1502 | 1525 | ||
1503 | c->congested_read_threshold_us = 2000; | 1526 | c->congested_read_threshold_us = 2000; |
@@ -1553,7 +1576,7 @@ static void run_cache_set(struct cache_set *c) | |||
1553 | k = &j->btree_root; | 1576 | k = &j->btree_root; |
1554 | 1577 | ||
1555 | err = "bad btree root"; | 1578 | err = "bad btree root"; |
1556 | if (bch_btree_ptr_invalid(c, k)) | 1579 | if (__bch_btree_ptr_invalid(c, k)) |
1557 | goto err; | 1580 | goto err; |
1558 | 1581 | ||
1559 | err = "error reading btree root"; | 1582 | err = "error reading btree root"; |
@@ -1747,6 +1770,7 @@ err: | |||
1747 | void bch_cache_release(struct kobject *kobj) | 1770 | void bch_cache_release(struct kobject *kobj) |
1748 | { | 1771 | { |
1749 | struct cache *ca = container_of(kobj, struct cache, kobj); | 1772 | struct cache *ca = container_of(kobj, struct cache, kobj); |
1773 | unsigned i; | ||
1750 | 1774 | ||
1751 | if (ca->set) | 1775 | if (ca->set) |
1752 | ca->set->cache[ca->sb.nr_this_dev] = NULL; | 1776 | ca->set->cache[ca->sb.nr_this_dev] = NULL; |
@@ -1760,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj) | |||
1760 | free_heap(&ca->heap); | 1784 | free_heap(&ca->heap); |
1761 | free_fifo(&ca->unused); | 1785 | free_fifo(&ca->unused); |
1762 | free_fifo(&ca->free_inc); | 1786 | free_fifo(&ca->free_inc); |
1763 | free_fifo(&ca->free); | 1787 | |
1788 | for (i = 0; i < RESERVE_NR; i++) | ||
1789 | free_fifo(&ca->free[i]); | ||
1764 | 1790 | ||
1765 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) | 1791 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) |
1766 | put_page(ca->sb_bio.bi_io_vec[0].bv_page); | 1792 | put_page(ca->sb_bio.bi_io_vec[0].bv_page); |
@@ -1786,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1786 | ca->journal.bio.bi_max_vecs = 8; | 1812 | ca->journal.bio.bi_max_vecs = 8; |
1787 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; | 1813 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; |
1788 | 1814 | ||
1789 | free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; | 1815 | free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; |
1790 | free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); | ||
1791 | 1816 | ||
1792 | if (!init_fifo(&ca->free, free, GFP_KERNEL) || | 1817 | if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || |
1818 | !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || | ||
1819 | !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || | ||
1820 | !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || | ||
1793 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || | 1821 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || |
1794 | !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || | 1822 | !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || |
1795 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || | 1823 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || |
@@ -2034,7 +2062,8 @@ static void bcache_exit(void) | |||
2034 | kobject_put(bcache_kobj); | 2062 | kobject_put(bcache_kobj); |
2035 | if (bcache_wq) | 2063 | if (bcache_wq) |
2036 | destroy_workqueue(bcache_wq); | 2064 | destroy_workqueue(bcache_wq); |
2037 | unregister_blkdev(bcache_major, "bcache"); | 2065 | if (bcache_major) |
2066 | unregister_blkdev(bcache_major, "bcache"); | ||
2038 | unregister_reboot_notifier(&reboot); | 2067 | unregister_reboot_notifier(&reboot); |
2039 | } | 2068 | } |
2040 | 2069 | ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a1f85612f0b3..d8458d477a12 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test); | |||
102 | rw_attribute(key_merging_disabled); | 102 | rw_attribute(key_merging_disabled); |
103 | rw_attribute(gc_always_rewrite); | 103 | rw_attribute(gc_always_rewrite); |
104 | rw_attribute(expensive_debug_checks); | 104 | rw_attribute(expensive_debug_checks); |
105 | rw_attribute(freelist_percent); | ||
106 | rw_attribute(cache_replacement_policy); | 105 | rw_attribute(cache_replacement_policy); |
107 | rw_attribute(btree_shrinker_disabled); | 106 | rw_attribute(btree_shrinker_disabled); |
108 | rw_attribute(copy_gc_enabled); | 107 | rw_attribute(copy_gc_enabled); |
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = { | |||
401 | }; | 400 | }; |
402 | KTYPE(bch_flash_dev); | 401 | KTYPE(bch_flash_dev); |
403 | 402 | ||
403 | struct bset_stats_op { | ||
404 | struct btree_op op; | ||
405 | size_t nodes; | ||
406 | struct bset_stats stats; | ||
407 | }; | ||
408 | |||
409 | static int btree_bset_stats(struct btree_op *b_op, struct btree *b) | ||
410 | { | ||
411 | struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); | ||
412 | |||
413 | op->nodes++; | ||
414 | bch_btree_keys_stats(&b->keys, &op->stats); | ||
415 | |||
416 | return MAP_CONTINUE; | ||
417 | } | ||
418 | |||
419 | static int bch_bset_print_stats(struct cache_set *c, char *buf) | ||
420 | { | ||
421 | struct bset_stats_op op; | ||
422 | int ret; | ||
423 | |||
424 | memset(&op, 0, sizeof(op)); | ||
425 | bch_btree_op_init(&op.op, -1); | ||
426 | |||
427 | ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); | ||
428 | if (ret < 0) | ||
429 | return ret; | ||
430 | |||
431 | return snprintf(buf, PAGE_SIZE, | ||
432 | "btree nodes: %zu\n" | ||
433 | "written sets: %zu\n" | ||
434 | "unwritten sets: %zu\n" | ||
435 | "written key bytes: %zu\n" | ||
436 | "unwritten key bytes: %zu\n" | ||
437 | "floats: %zu\n" | ||
438 | "failed: %zu\n", | ||
439 | op.nodes, | ||
440 | op.stats.sets_written, op.stats.sets_unwritten, | ||
441 | op.stats.bytes_written, op.stats.bytes_unwritten, | ||
442 | op.stats.floats, op.stats.failed); | ||
443 | } | ||
444 | |||
404 | SHOW(__bch_cache_set) | 445 | SHOW(__bch_cache_set) |
405 | { | 446 | { |
406 | unsigned root_usage(struct cache_set *c) | 447 | unsigned root_usage(struct cache_set *c) |
@@ -419,7 +460,7 @@ lock_root: | |||
419 | rw_lock(false, b, b->level); | 460 | rw_lock(false, b, b->level); |
420 | } while (b != c->root); | 461 | } while (b != c->root); |
421 | 462 | ||
422 | for_each_key_filter(b, k, &iter, bch_ptr_bad) | 463 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) |
423 | bytes += bkey_bytes(k); | 464 | bytes += bkey_bytes(k); |
424 | 465 | ||
425 | rw_unlock(false, b); | 466 | rw_unlock(false, b); |
@@ -434,7 +475,7 @@ lock_root: | |||
434 | 475 | ||
435 | mutex_lock(&c->bucket_lock); | 476 | mutex_lock(&c->bucket_lock); |
436 | list_for_each_entry(b, &c->btree_cache, list) | 477 | list_for_each_entry(b, &c->btree_cache, list) |
437 | ret += 1 << (b->page_order + PAGE_SHIFT); | 478 | ret += 1 << (b->keys.page_order + PAGE_SHIFT); |
438 | 479 | ||
439 | mutex_unlock(&c->bucket_lock); | 480 | mutex_unlock(&c->bucket_lock); |
440 | return ret; | 481 | return ret; |
@@ -491,7 +532,7 @@ lock_root: | |||
491 | 532 | ||
492 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); | 533 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); |
493 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); | 534 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); |
494 | sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); | 535 | sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); |
495 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); | 536 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); |
496 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); | 537 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); |
497 | 538 | ||
@@ -711,9 +752,6 @@ SHOW(__bch_cache) | |||
711 | sysfs_print(io_errors, | 752 | sysfs_print(io_errors, |
712 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | 753 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); |
713 | 754 | ||
714 | sysfs_print(freelist_percent, ca->free.size * 100 / | ||
715 | ((size_t) ca->sb.nbuckets)); | ||
716 | |||
717 | if (attr == &sysfs_cache_replacement_policy) | 755 | if (attr == &sysfs_cache_replacement_policy) |
718 | return bch_snprint_string_list(buf, PAGE_SIZE, | 756 | return bch_snprint_string_list(buf, PAGE_SIZE, |
719 | cache_replacement_policies, | 757 | cache_replacement_policies, |
@@ -820,32 +858,6 @@ STORE(__bch_cache) | |||
820 | } | 858 | } |
821 | } | 859 | } |
822 | 860 | ||
823 | if (attr == &sysfs_freelist_percent) { | ||
824 | DECLARE_FIFO(long, free); | ||
825 | long i; | ||
826 | size_t p = strtoul_or_return(buf); | ||
827 | |||
828 | p = clamp_t(size_t, | ||
829 | ((size_t) ca->sb.nbuckets * p) / 100, | ||
830 | roundup_pow_of_two(ca->sb.nbuckets) >> 9, | ||
831 | ca->sb.nbuckets / 2); | ||
832 | |||
833 | if (!init_fifo_exact(&free, p, GFP_KERNEL)) | ||
834 | return -ENOMEM; | ||
835 | |||
836 | mutex_lock(&ca->set->bucket_lock); | ||
837 | |||
838 | fifo_move(&free, &ca->free); | ||
839 | fifo_swap(&free, &ca->free); | ||
840 | |||
841 | mutex_unlock(&ca->set->bucket_lock); | ||
842 | |||
843 | while (fifo_pop(&free, i)) | ||
844 | atomic_dec(&ca->buckets[i].pin); | ||
845 | |||
846 | free_fifo(&free); | ||
847 | } | ||
848 | |||
849 | if (attr == &sysfs_clear_stats) { | 861 | if (attr == &sysfs_clear_stats) { |
850 | atomic_long_set(&ca->sectors_written, 0); | 862 | atomic_long_set(&ca->sectors_written, 0); |
851 | atomic_long_set(&ca->btree_sectors_written, 0); | 863 | atomic_long_set(&ca->btree_sectors_written, 0); |
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = { | |||
869 | &sysfs_metadata_written, | 881 | &sysfs_metadata_written, |
870 | &sysfs_io_errors, | 882 | &sysfs_io_errors, |
871 | &sysfs_clear_stats, | 883 | &sysfs_clear_stats, |
872 | &sysfs_freelist_percent, | ||
873 | &sysfs_cache_replacement_policy, | 884 | &sysfs_cache_replacement_policy, |
874 | NULL | 885 | NULL |
875 | }; | 886 | }; |
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index bb37618e7664..db3ae4c2b223 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
@@ -224,10 +224,10 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) | |||
224 | 224 | ||
225 | void bch_bio_map(struct bio *bio, void *base) | 225 | void bch_bio_map(struct bio *bio, void *base) |
226 | { | 226 | { |
227 | size_t size = bio->bi_size; | 227 | size_t size = bio->bi_iter.bi_size; |
228 | struct bio_vec *bv = bio->bi_io_vec; | 228 | struct bio_vec *bv = bio->bi_io_vec; |
229 | 229 | ||
230 | BUG_ON(!bio->bi_size); | 230 | BUG_ON(!bio->bi_iter.bi_size); |
231 | BUG_ON(bio->bi_vcnt); | 231 | BUG_ON(bio->bi_vcnt); |
232 | 232 | ||
233 | bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; | 233 | bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; |
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1030c6020e98..ac7d0d1f70d7 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #ifndef _BCACHE_UTIL_H | 2 | #ifndef _BCACHE_UTIL_H |
3 | #define _BCACHE_UTIL_H | 3 | #define _BCACHE_UTIL_H |
4 | 4 | ||
5 | #include <linux/blkdev.h> | ||
5 | #include <linux/errno.h> | 6 | #include <linux/errno.h> |
6 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
7 | #include <linux/llist.h> | 8 | #include <linux/llist.h> |
@@ -17,11 +18,13 @@ struct closure; | |||
17 | 18 | ||
18 | #ifdef CONFIG_BCACHE_DEBUG | 19 | #ifdef CONFIG_BCACHE_DEBUG |
19 | 20 | ||
21 | #define EBUG_ON(cond) BUG_ON(cond) | ||
20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | 22 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) |
21 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) | 23 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) |
22 | 24 | ||
23 | #else /* DEBUG */ | 25 | #else /* DEBUG */ |
24 | 26 | ||
27 | #define EBUG_ON(cond) do { if (cond); } while (0) | ||
25 | #define atomic_dec_bug(v) atomic_dec(v) | 28 | #define atomic_dec_bug(v) atomic_dec(v) |
26 | #define atomic_inc_bug(v, i) atomic_inc(v) | 29 | #define atomic_inc_bug(v, i) atomic_inc(v) |
27 | 30 | ||
@@ -391,6 +394,11 @@ struct time_stats { | |||
391 | 394 | ||
392 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); | 395 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); |
393 | 396 | ||
397 | static inline unsigned local_clock_us(void) | ||
398 | { | ||
399 | return local_clock() >> 10; | ||
400 | } | ||
401 | |||
394 | #define NSEC_PER_ns 1L | 402 | #define NSEC_PER_ns 1L |
395 | #define NSEC_PER_us NSEC_PER_USEC | 403 | #define NSEC_PER_us NSEC_PER_USEC |
396 | #define NSEC_PER_ms NSEC_PER_MSEC | 404 | #define NSEC_PER_ms NSEC_PER_MSEC |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6c44fe059c27..f4300e4c0114 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -111,7 +111,7 @@ static void dirty_init(struct keybuf_key *w) | |||
111 | if (!io->dc->writeback_percent) | 111 | if (!io->dc->writeback_percent) |
112 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | 112 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
113 | 113 | ||
114 | bio->bi_size = KEY_SIZE(&w->key) << 9; | 114 | bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; |
115 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | 115 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); |
116 | bio->bi_private = w; | 116 | bio->bi_private = w; |
117 | bio->bi_io_vec = bio->bi_inline_vecs; | 117 | bio->bi_io_vec = bio->bi_inline_vecs; |
@@ -184,7 +184,7 @@ static void write_dirty(struct closure *cl) | |||
184 | 184 | ||
185 | dirty_init(w); | 185 | dirty_init(w); |
186 | io->bio.bi_rw = WRITE; | 186 | io->bio.bi_rw = WRITE; |
187 | io->bio.bi_sector = KEY_START(&w->key); | 187 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); |
188 | io->bio.bi_bdev = io->dc->bdev; | 188 | io->bio.bi_bdev = io->dc->bdev; |
189 | io->bio.bi_end_io = dirty_endio; | 189 | io->bio.bi_end_io = dirty_endio; |
190 | 190 | ||
@@ -253,7 +253,7 @@ static void read_dirty(struct cached_dev *dc) | |||
253 | io->dc = dc; | 253 | io->dc = dc; |
254 | 254 | ||
255 | dirty_init(w); | 255 | dirty_init(w); |
256 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); | 256 | io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); |
257 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, | 257 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, |
258 | &w->key, 0)->bdev; | 258 | &w->key, 0)->bdev; |
259 | io->bio.bi_rw = READ; | 259 | io->bio.bi_rw = READ; |
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c9ddcf4614b9..e2f8598937ac 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h | |||
@@ -50,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | |||
50 | return false; | 50 | return false; |
51 | 51 | ||
52 | if (dc->partial_stripes_expensive && | 52 | if (dc->partial_stripes_expensive && |
53 | bcache_dev_stripe_dirty(dc, bio->bi_sector, | 53 | bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, |
54 | bio_sectors(bio))) | 54 | bio_sectors(bio))) |
55 | return true; | 55 | return true; |
56 | 56 | ||
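The bi_sector/bi_size/bi_idx edits scattered through the bcache hunks above are the immutable-biovec conversion: the fields describing a bio's current position now live inside bio->bi_iter, a struct bvec_iter. A small accessor sketch of the old-to-new field mapping, assuming the 3.14-era layout:

#include <linux/bio.h>

/* Accessor sketches only; the point is the old-field -> new-field mapping. */
static inline sector_t sketch_bio_sector(struct bio *bio)
{
	return bio->bi_iter.bi_sector;	/* was bio->bi_sector */
}

static inline unsigned sketch_bio_bytes(struct bio *bio)
{
	return bio->bi_iter.bi_size;	/* was bio->bi_size */
}

static inline unsigned sketch_bio_idx(struct bio *bio)
{
	return bio->bi_iter.bi_idx;	/* was bio->bi_idx */
}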
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 12dc29ba7399..4195a01b1535 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -1635,7 +1635,7 @@ int bitmap_create(struct mddev *mddev) | |||
1635 | sector_t blocks = mddev->resync_max_sectors; | 1635 | sector_t blocks = mddev->resync_max_sectors; |
1636 | struct file *file = mddev->bitmap_info.file; | 1636 | struct file *file = mddev->bitmap_info.file; |
1637 | int err; | 1637 | int err; |
1638 | struct sysfs_dirent *bm = NULL; | 1638 | struct kernfs_node *bm = NULL; |
1639 | 1639 | ||
1640 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1640 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
1641 | 1641 | ||
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index df4aeb6ac6f0..30210b9c4ef9 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h | |||
@@ -225,7 +225,7 @@ struct bitmap { | |||
225 | wait_queue_head_t overflow_wait; | 225 | wait_queue_head_t overflow_wait; |
226 | wait_queue_head_t behind_wait; | 226 | wait_queue_head_t behind_wait; |
227 | 227 | ||
228 | struct sysfs_dirent *sysfs_can_clear; | 228 | struct kernfs_node *sysfs_can_clear; |
229 | }; | 229 | }; |
230 | 230 | ||
231 | /* the bitmap API */ | 231 | /* the bitmap API */ |
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 3a8cfa2645c7..dd3646111561 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h | |||
@@ -17,55 +17,24 @@ | |||
17 | * original bio state. | 17 | * original bio state. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | struct dm_bio_vec_details { | ||
21 | #if PAGE_SIZE < 65536 | ||
22 | __u16 bv_len; | ||
23 | __u16 bv_offset; | ||
24 | #else | ||
25 | unsigned bv_len; | ||
26 | unsigned bv_offset; | ||
27 | #endif | ||
28 | }; | ||
29 | |||
30 | struct dm_bio_details { | 20 | struct dm_bio_details { |
31 | sector_t bi_sector; | ||
32 | struct block_device *bi_bdev; | 21 | struct block_device *bi_bdev; |
33 | unsigned int bi_size; | ||
34 | unsigned short bi_idx; | ||
35 | unsigned long bi_flags; | 22 | unsigned long bi_flags; |
36 | struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; | 23 | struct bvec_iter bi_iter; |
37 | }; | 24 | }; |
38 | 25 | ||
39 | static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) | 26 | static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) |
40 | { | 27 | { |
41 | unsigned i; | ||
42 | |||
43 | bd->bi_sector = bio->bi_sector; | ||
44 | bd->bi_bdev = bio->bi_bdev; | 28 | bd->bi_bdev = bio->bi_bdev; |
45 | bd->bi_size = bio->bi_size; | ||
46 | bd->bi_idx = bio->bi_idx; | ||
47 | bd->bi_flags = bio->bi_flags; | 29 | bd->bi_flags = bio->bi_flags; |
48 | 30 | bd->bi_iter = bio->bi_iter; | |
49 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
50 | bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len; | ||
51 | bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset; | ||
52 | } | ||
53 | } | 31 | } |
54 | 32 | ||
55 | static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) | 33 | static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) |
56 | { | 34 | { |
57 | unsigned i; | ||
58 | |||
59 | bio->bi_sector = bd->bi_sector; | ||
60 | bio->bi_bdev = bd->bi_bdev; | 35 | bio->bi_bdev = bd->bi_bdev; |
61 | bio->bi_size = bd->bi_size; | ||
62 | bio->bi_idx = bd->bi_idx; | ||
63 | bio->bi_flags = bd->bi_flags; | 36 | bio->bi_flags = bd->bi_flags; |
64 | 37 | bio->bi_iter = bd->bi_iter; | |
65 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
66 | bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len; | ||
67 | bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset; | ||
68 | } | ||
69 | } | 38 | } |
70 | 39 | ||
71 | #endif | 40 | #endif |
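With the whole iterator captured in one struct, dm_bio_record()/dm_bio_restore() no longer need to copy the biovec array. A hedged sketch of how a target might use the pair around a remap it may have to undo later (the function names below are illustrative, not from this patch):

#include "dm-bio-record.h"

/* Save the bio's state before remapping it, so a failed submission can be
 * retried against the original device and position. */
static void sketch_remap(struct bio *bio, struct block_device *new_bdev,
			 sector_t new_sector, struct dm_bio_details *saved)
{
	dm_bio_record(saved, bio);		/* snapshots bi_bdev, bi_flags, bi_iter */

	bio->bi_bdev = new_bdev;
	bio->bi_iter.bi_sector = new_sector;
	/* ... submit the bio ... */
}

static void sketch_retry(struct bio *bio, struct dm_bio_details *saved)
{
	dm_bio_restore(saved, bio);		/* puts the recorded state back */
	/* ... requeue the bio on the original path ... */
}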
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 54bdd923316f..66c5d130c8c2 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c | |||
@@ -104,6 +104,8 @@ struct dm_bufio_client { | |||
104 | struct list_head reserved_buffers; | 104 | struct list_head reserved_buffers; |
105 | unsigned need_reserved_buffers; | 105 | unsigned need_reserved_buffers; |
106 | 106 | ||
107 | unsigned minimum_buffers; | ||
108 | |||
107 | struct hlist_head *cache_hash; | 109 | struct hlist_head *cache_hash; |
108 | wait_queue_head_t free_buffer_wait; | 110 | wait_queue_head_t free_buffer_wait; |
109 | 111 | ||
@@ -538,7 +540,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, | |||
538 | bio_init(&b->bio); | 540 | bio_init(&b->bio); |
539 | b->bio.bi_io_vec = b->bio_vec; | 541 | b->bio.bi_io_vec = b->bio_vec; |
540 | b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; | 542 | b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; |
541 | b->bio.bi_sector = block << b->c->sectors_per_block_bits; | 543 | b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; |
542 | b->bio.bi_bdev = b->c->bdev; | 544 | b->bio.bi_bdev = b->c->bdev; |
543 | b->bio.bi_end_io = end_io; | 545 | b->bio.bi_end_io = end_io; |
544 | 546 | ||
@@ -861,8 +863,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, | |||
861 | buffers = dm_bufio_cache_size_per_client >> | 863 | buffers = dm_bufio_cache_size_per_client >> |
862 | (c->sectors_per_block_bits + SECTOR_SHIFT); | 864 | (c->sectors_per_block_bits + SECTOR_SHIFT); |
863 | 865 | ||
864 | if (buffers < DM_BUFIO_MIN_BUFFERS) | 866 | if (buffers < c->minimum_buffers) |
865 | buffers = DM_BUFIO_MIN_BUFFERS; | 867 | buffers = c->minimum_buffers; |
866 | 868 | ||
867 | *limit_buffers = buffers; | 869 | *limit_buffers = buffers; |
868 | *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; | 870 | *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; |
@@ -1350,6 +1352,34 @@ retry: | |||
1350 | } | 1352 | } |
1351 | EXPORT_SYMBOL_GPL(dm_bufio_release_move); | 1353 | EXPORT_SYMBOL_GPL(dm_bufio_release_move); |
1352 | 1354 | ||
1355 | /* | ||
1356 | * Free the given buffer. | ||
1357 | * | ||
1358 | * This is just a hint, if the buffer is in use or dirty, this function | ||
1359 | * does nothing. | ||
1360 | */ | ||
1361 | void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) | ||
1362 | { | ||
1363 | struct dm_buffer *b; | ||
1364 | |||
1365 | dm_bufio_lock(c); | ||
1366 | |||
1367 | b = __find(c, block); | ||
1368 | if (b && likely(!b->hold_count) && likely(!b->state)) { | ||
1369 | __unlink_buffer(b); | ||
1370 | __free_buffer_wake(b); | ||
1371 | } | ||
1372 | |||
1373 | dm_bufio_unlock(c); | ||
1374 | } | ||
1375 | EXPORT_SYMBOL(dm_bufio_forget); | ||
1376 | |||
1377 | void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) | ||
1378 | { | ||
1379 | c->minimum_buffers = n; | ||
1380 | } | ||
1381 | EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); | ||
1382 | |||
1353 | unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) | 1383 | unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) |
1354 | { | 1384 | { |
1355 | return c->block_size; | 1385 | return c->block_size; |
@@ -1546,6 +1576,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign | |||
1546 | INIT_LIST_HEAD(&c->reserved_buffers); | 1576 | INIT_LIST_HEAD(&c->reserved_buffers); |
1547 | c->need_reserved_buffers = reserved_buffers; | 1577 | c->need_reserved_buffers = reserved_buffers; |
1548 | 1578 | ||
1579 | c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; | ||
1580 | |||
1549 | init_waitqueue_head(&c->free_buffer_wait); | 1581 | init_waitqueue_head(&c->free_buffer_wait); |
1550 | c->async_write_error = 0; | 1582 | c->async_write_error = 0; |
1551 | 1583 | ||
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index b142946a9e32..c096779a7292 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h | |||
@@ -108,6 +108,18 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c); | |||
108 | */ | 108 | */ |
109 | void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); | 109 | void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); |
110 | 110 | ||
111 | /* | ||
112 | * Free the given buffer. | ||
113 | * This is just a hint; if the buffer is in use or dirty, this function | ||
114 | * does nothing. | ||
115 | */ | ||
116 | void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); | ||
117 | |||
118 | /* | ||
119 | * Set the minimum number of buffers before cleanup happens. | ||
120 | */ | ||
121 | void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n); | ||
122 | |||
111 | unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); | 123 | unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); |
112 | sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); | 124 | sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); |
113 | sector_t dm_bufio_get_block_number(struct dm_buffer *b); | 125 | sector_t dm_bufio_get_block_number(struct dm_buffer *b); |
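Both new dm-bufio calls are hints rather than guarantees: dm_bufio_forget() drops a buffer only if it is clean and unreferenced, and dm_bufio_set_minimum_buffers() raises the per-client floor that the cache will not shrink below. A usage sketch, assuming c is a client created earlier with dm_bufio_client_create():

#include <linux/err.h>
#include "dm-bufio.h"

static void sketch_bufio_usage(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	void *data;

	/* Keep at least 64 buffers cached for this client (value is made up). */
	dm_bufio_set_minimum_buffers(c, 64);

	data = dm_bufio_read(c, block, &b);
	if (!IS_ERR(data)) {
		/* ... consume the block's contents ... */
		dm_bufio_release(b);
	}

	/* Done with this block: let bufio free it early if it is clean and
	 * unused; a no-op otherwise. */
	dm_bufio_forget(c, block);
}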
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c new file mode 100644 index 000000000000..6c9049c51b2b --- /dev/null +++ b/drivers/md/dm-builtin.c | |||
@@ -0,0 +1,48 @@ | |||
1 | #include "dm.h" | ||
2 | |||
3 | /* | ||
4 | * The kobject release method must not be placed in the module itself, | ||
5 | * otherwise we are subject to module unload races. | ||
6 | * | ||
7 | * The release method is called when the last reference to the kobject is | ||
8 | * dropped. It may be called by any other kernel code that drops the last | ||
9 | * reference. | ||
10 | * | ||
11 | * The release method suffers from a module unload race. We may prevent the | ||
12 | * module from being unloaded at the start of the release method (using | ||
13 | * increased module reference count or synchronizing against the release | ||
14 | * method), however there is no way to prevent the module from being | ||
15 | * unloaded at the end of the release method. | ||
16 | * | ||
17 | * If this code were placed in the dm module, the following race may | ||
18 | * happen: | ||
19 | * 1. Some other process takes a reference to dm kobject | ||
20 | * 2. The user issues ioctl function to unload the dm device | ||
21 | * 3. dm_sysfs_exit calls kobject_put, however the object is not released | ||
22 | * because of the other reference taken at step 1 | ||
23 | * 4. dm_sysfs_exit waits on the completion | ||
24 | * 5. The other process that took the reference in step 1 drops it, | ||
25 | * dm_kobject_release is called from this process | ||
26 | * 6. dm_kobject_release calls complete() | ||
27 | * 7. a reschedule happens before dm_kobject_release returns | ||
28 | * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference | ||
29 | * count is decremented | ||
30 | * 9. The user unloads the dm module | ||
31 | * 10. The other process that was rescheduled in step 7 continues to run; | ||
32 | * it is now executing code in an unloaded module, so it crashes | ||
33 | * | ||
34 | * Note that if the process that takes the foreign reference to dm kobject | ||
35 | * has a low priority and the system is sufficiently loaded with | ||
36 | * higher-priority processes that prevent the low-priority process from | ||
37 | * being scheduled long enough, this bug may really happen. | ||
38 | * | ||
39 | * In order to fix this module unload race, we place the release method | ||
40 | * into a helper code that is compiled directly into the kernel. | ||
41 | */ | ||
42 | |||
43 | void dm_kobject_release(struct kobject *kobj) | ||
44 | { | ||
45 | complete(dm_get_completion_from_kobject(kobj)); | ||
46 | } | ||
47 | |||
48 | EXPORT_SYMBOL(dm_kobject_release); | ||
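The waiting side that pairs with this built-in release method lives in the dm module itself; the sketch below shows the intended shape of that pairing. Apart from dm_kobject_release() and dm_get_completion_from_kobject(), the structure and function names are hypothetical:

#include <linux/kobject.h>
#include <linux/completion.h>
#include "dm.h"		/* assumed to declare dm_kobject_release() */

/* Hypothetical holder: a kobject embedded next to the completion that
 * dm_get_completion_from_kobject() is expected to return. */
struct sketch_kobj_holder {
	struct kobject		kobj;
	struct completion	completion;
};

static struct kobj_type sketch_ktype = {
	.release = dm_kobject_release,	/* built into the kernel, not the module */
};

static void sketch_teardown(struct sketch_kobj_holder *holder)
{
	/*
	 * Drop our reference.  If some other code still holds one, the
	 * release (and its complete()) runs later, from that code's context.
	 */
	kobject_put(&holder->kobj);

	/* Block until the last reference really is gone ... */
	wait_for_completion(&holder->completion);

	/* ... and only then is it safe to tear down the module's data. */
}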
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 64780ad73bb0..0e385e40909e 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c | |||
@@ -72,7 +72,7 @@ static enum io_pattern iot_pattern(struct io_tracker *t) | |||
72 | 72 | ||
73 | static void iot_update_stats(struct io_tracker *t, struct bio *bio) | 73 | static void iot_update_stats(struct io_tracker *t, struct bio *bio) |
74 | { | 74 | { |
75 | if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) | 75 | if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) |
76 | t->nr_seq_samples++; | 76 | t->nr_seq_samples++; |
77 | else { | 77 | else { |
78 | /* | 78 | /* |
@@ -87,7 +87,7 @@ static void iot_update_stats(struct io_tracker *t, struct bio *bio) | |||
87 | t->nr_rand_samples++; | 87 | t->nr_rand_samples++; |
88 | } | 88 | } |
89 | 89 | ||
90 | t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); | 90 | t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); |
91 | } | 91 | } |
92 | 92 | ||
93 | static void iot_check_for_pattern_switch(struct io_tracker *t) | 93 | static void iot_check_for_pattern_switch(struct io_tracker *t) |
@@ -287,9 +287,8 @@ static struct entry *alloc_entry(struct entry_pool *ep) | |||
287 | static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) | 287 | static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) |
288 | { | 288 | { |
289 | struct entry *e = ep->entries + from_cblock(cblock); | 289 | struct entry *e = ep->entries + from_cblock(cblock); |
290 | list_del(&e->list); | ||
291 | 290 | ||
292 | INIT_LIST_HEAD(&e->list); | 291 | list_del_init(&e->list); |
293 | INIT_HLIST_NODE(&e->hlist); | 292 | INIT_HLIST_NODE(&e->hlist); |
294 | ep->nr_allocated++; | 293 | ep->nr_allocated++; |
295 | 294 | ||
@@ -391,6 +390,10 @@ struct mq_policy { | |||
391 | */ | 390 | */ |
392 | unsigned promote_threshold; | 391 | unsigned promote_threshold; |
393 | 392 | ||
393 | unsigned discard_promote_adjustment; | ||
394 | unsigned read_promote_adjustment; | ||
395 | unsigned write_promote_adjustment; | ||
396 | |||
394 | /* | 397 | /* |
395 | * The hash table allows us to quickly find an entry by origin | 398 | * The hash table allows us to quickly find an entry by origin |
396 | * block. Both pre_cache and cache entries are in here. | 399 | * block. Both pre_cache and cache entries are in here. |
@@ -400,6 +403,10 @@ struct mq_policy { | |||
400 | struct hlist_head *table; | 403 | struct hlist_head *table; |
401 | }; | 404 | }; |
402 | 405 | ||
406 | #define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 | ||
407 | #define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 | ||
408 | #define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 | ||
409 | |||
403 | /*----------------------------------------------------------------*/ | 410 | /*----------------------------------------------------------------*/ |
404 | 411 | ||
405 | /* | 412 | /* |
@@ -642,25 +649,21 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) | |||
642 | * We bias towards reads, since they can be demoted at no cost if they | 649 | * We bias towards reads, since they can be demoted at no cost if they |
643 | * haven't been dirtied. | 650 | * haven't been dirtied. |
644 | */ | 651 | */ |
645 | #define DISCARDED_PROMOTE_THRESHOLD 1 | ||
646 | #define READ_PROMOTE_THRESHOLD 4 | ||
647 | #define WRITE_PROMOTE_THRESHOLD 8 | ||
648 | |||
649 | static unsigned adjusted_promote_threshold(struct mq_policy *mq, | 652 | static unsigned adjusted_promote_threshold(struct mq_policy *mq, |
650 | bool discarded_oblock, int data_dir) | 653 | bool discarded_oblock, int data_dir) |
651 | { | 654 | { |
652 | if (data_dir == READ) | 655 | if (data_dir == READ) |
653 | return mq->promote_threshold + READ_PROMOTE_THRESHOLD; | 656 | return mq->promote_threshold + mq->read_promote_adjustment; |
654 | 657 | ||
655 | if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { | 658 | if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { |
656 | /* | 659 | /* |
657 | * We don't need to do any copying at all, so give this a | 660 | * We don't need to do any copying at all, so give this a |
658 | * very low threshold. | 661 | * very low threshold. |
659 | */ | 662 | */ |
660 | return DISCARDED_PROMOTE_THRESHOLD; | 663 | return mq->discard_promote_adjustment; |
661 | } | 664 | } |
662 | 665 | ||
663 | return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; | 666 | return mq->promote_threshold + mq->write_promote_adjustment; |
664 | } | 667 | } |
665 | 668 | ||
666 | static bool should_promote(struct mq_policy *mq, struct entry *e, | 669 | static bool should_promote(struct mq_policy *mq, struct entry *e, |
@@ -809,7 +812,7 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, | |||
809 | bool can_migrate, bool discarded_oblock, | 812 | bool can_migrate, bool discarded_oblock, |
810 | int data_dir, struct policy_result *result) | 813 | int data_dir, struct policy_result *result) |
811 | { | 814 | { |
812 | if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { | 815 | if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { |
813 | if (can_migrate) | 816 | if (can_migrate) |
814 | insert_in_cache(mq, oblock, result); | 817 | insert_in_cache(mq, oblock, result); |
815 | else | 818 | else |
@@ -869,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p) | |||
869 | { | 872 | { |
870 | struct mq_policy *mq = to_mq_policy(p); | 873 | struct mq_policy *mq = to_mq_policy(p); |
871 | 874 | ||
872 | kfree(mq->table); | 875 | vfree(mq->table); |
873 | epool_exit(&mq->cache_pool); | 876 | epool_exit(&mq->cache_pool); |
874 | epool_exit(&mq->pre_cache_pool); | 877 | epool_exit(&mq->pre_cache_pool); |
875 | kfree(mq); | 878 | kfree(mq); |
@@ -1135,20 +1138,28 @@ static int mq_set_config_value(struct dm_cache_policy *p, | |||
1135 | const char *key, const char *value) | 1138 | const char *key, const char *value) |
1136 | { | 1139 | { |
1137 | struct mq_policy *mq = to_mq_policy(p); | 1140 | struct mq_policy *mq = to_mq_policy(p); |
1138 | enum io_pattern pattern; | ||
1139 | unsigned long tmp; | 1141 | unsigned long tmp; |
1140 | 1142 | ||
1141 | if (!strcasecmp(key, "random_threshold")) | ||
1142 | pattern = PATTERN_RANDOM; | ||
1143 | else if (!strcasecmp(key, "sequential_threshold")) | ||
1144 | pattern = PATTERN_SEQUENTIAL; | ||
1145 | else | ||
1146 | return -EINVAL; | ||
1147 | |||
1148 | if (kstrtoul(value, 10, &tmp)) | 1143 | if (kstrtoul(value, 10, &tmp)) |
1149 | return -EINVAL; | 1144 | return -EINVAL; |
1150 | 1145 | ||
1151 | mq->tracker.thresholds[pattern] = tmp; | 1146 | if (!strcasecmp(key, "random_threshold")) { |
1147 | mq->tracker.thresholds[PATTERN_RANDOM] = tmp; | ||
1148 | |||
1149 | } else if (!strcasecmp(key, "sequential_threshold")) { | ||
1150 | mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; | ||
1151 | |||
1152 | } else if (!strcasecmp(key, "discard_promote_adjustment")) | ||
1153 | mq->discard_promote_adjustment = tmp; | ||
1154 | |||
1155 | else if (!strcasecmp(key, "read_promote_adjustment")) | ||
1156 | mq->read_promote_adjustment = tmp; | ||
1157 | |||
1158 | else if (!strcasecmp(key, "write_promote_adjustment")) | ||
1159 | mq->write_promote_adjustment = tmp; | ||
1160 | |||
1161 | else | ||
1162 | return -EINVAL; | ||
1152 | 1163 | ||
1153 | return 0; | 1164 | return 0; |
1154 | } | 1165 | } |
@@ -1158,9 +1169,16 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsign | |||
1158 | ssize_t sz = 0; | 1169 | ssize_t sz = 0; |
1159 | struct mq_policy *mq = to_mq_policy(p); | 1170 | struct mq_policy *mq = to_mq_policy(p); |
1160 | 1171 | ||
1161 | DMEMIT("4 random_threshold %u sequential_threshold %u", | 1172 | DMEMIT("10 random_threshold %u " |
1173 | "sequential_threshold %u " | ||
1174 | "discard_promote_adjustment %u " | ||
1175 | "read_promote_adjustment %u " | ||
1176 | "write_promote_adjustment %u", | ||
1162 | mq->tracker.thresholds[PATTERN_RANDOM], | 1177 | mq->tracker.thresholds[PATTERN_RANDOM], |
1163 | mq->tracker.thresholds[PATTERN_SEQUENTIAL]); | 1178 | mq->tracker.thresholds[PATTERN_SEQUENTIAL], |
1179 | mq->discard_promote_adjustment, | ||
1180 | mq->read_promote_adjustment, | ||
1181 | mq->write_promote_adjustment); | ||
1164 | 1182 | ||
1165 | return 0; | 1183 | return 0; |
1166 | } | 1184 | } |
@@ -1213,6 +1231,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
1213 | mq->hit_count = 0; | 1231 | mq->hit_count = 0; |
1214 | mq->generation = 0; | 1232 | mq->generation = 0; |
1215 | mq->promote_threshold = 0; | 1233 | mq->promote_threshold = 0; |
1234 | mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; | ||
1235 | mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; | ||
1236 | mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; | ||
1216 | mutex_init(&mq->lock); | 1237 | mutex_init(&mq->lock); |
1217 | spin_lock_init(&mq->tick_lock); | 1238 | spin_lock_init(&mq->tick_lock); |
1218 | 1239 | ||
@@ -1224,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
1224 | 1245 | ||
1225 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); | 1246 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); |
1226 | mq->hash_bits = ffs(mq->nr_buckets) - 1; | 1247 | mq->hash_bits = ffs(mq->nr_buckets) - 1; |
1227 | mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); | 1248 | mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); |
1228 | if (!mq->table) | 1249 | if (!mq->table) |
1229 | goto bad_alloc_table; | 1250 | goto bad_alloc_table; |
1230 | 1251 | ||
@@ -1244,7 +1265,7 @@ bad_pre_cache_init: | |||
1244 | 1265 | ||
1245 | static struct dm_cache_policy_type mq_policy_type = { | 1266 | static struct dm_cache_policy_type mq_policy_type = { |
1246 | .name = "mq", | 1267 | .name = "mq", |
1247 | .version = {1, 1, 0}, | 1268 | .version = {1, 2, 0}, |
1248 | .hint_size = 4, | 1269 | .hint_size = 4, |
1249 | .owner = THIS_MODULE, | 1270 | .owner = THIS_MODULE, |
1250 | .create = mq_create | 1271 | .create = mq_create |
@@ -1252,10 +1273,11 @@ static struct dm_cache_policy_type mq_policy_type = { | |||
1252 | 1273 | ||
1253 | static struct dm_cache_policy_type default_policy_type = { | 1274 | static struct dm_cache_policy_type default_policy_type = { |
1254 | .name = "default", | 1275 | .name = "default", |
1255 | .version = {1, 1, 0}, | 1276 | .version = {1, 2, 0}, |
1256 | .hint_size = 4, | 1277 | .hint_size = 4, |
1257 | .owner = THIS_MODULE, | 1278 | .owner = THIS_MODULE, |
1258 | .create = mq_create | 1279 | .create = mq_create, |
1280 | .real = &mq_policy_type | ||
1259 | }; | 1281 | }; |
1260 | 1282 | ||
1261 | static int __init mq_init(void) | 1283 | static int __init mq_init(void) |
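With the old fixed thresholds turned into tunables, the effective promotion requirement works out as follows under the new defaults (discard 1, read 4, write 8); the promote_threshold value of 16 below is made up purely for illustration:

	read miss                                    -> 16 + 4 = 20 pre-cache hits before promotion
	write miss                                   -> 16 + 8 = 24 hits
	write to a discarded block, with a free or
	clean cache block available                  -> 1 hit (no copy needed)

This is also why the fast-path test in no_entry_found() changes from == 1 to <= 1: a user may lower discard_promote_adjustment to 0, and immediate promotion should still trigger.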
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index d80057968407..c1a3cee99b44 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c | |||
@@ -146,6 +146,10 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p) | |||
146 | { | 146 | { |
147 | struct dm_cache_policy_type *t = p->private; | 147 | struct dm_cache_policy_type *t = p->private; |
148 | 148 | ||
149 | /* if t->real is set then an alias was used (e.g. "default") */ | ||
150 | if (t->real) | ||
151 | return t->real->name; | ||
152 | |||
149 | return t->name; | 153 | return t->name; |
150 | } | 154 | } |
151 | EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); | 155 | EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); |
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 052c00a84a5c..f50fe360c546 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h | |||
@@ -223,6 +223,12 @@ struct dm_cache_policy_type { | |||
223 | unsigned version[CACHE_POLICY_VERSION_SIZE]; | 223 | unsigned version[CACHE_POLICY_VERSION_SIZE]; |
224 | 224 | ||
225 | /* | 225 | /* |
226 | * For use by an alias dm_cache_policy_type to point to the | ||
227 | * real dm_cache_policy_type. | ||
228 | */ | ||
229 | struct dm_cache_policy_type *real; | ||
230 | |||
231 | /* | ||
226 | * Policies may store a hint for each cache block. | 232 | * Policies may store a hint for each cache block. |
227 | * Currently the size of this hint must be 0 or 4 bytes but we | 233 | * Currently the size of this hint must be 0 or 4 bytes but we |
228 | * expect to relax this in future. | 234 | * expect to relax this in future. |
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1b1469ebe5cb..074b9c8e4cf0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c | |||
@@ -85,6 +85,12 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) | |||
85 | { | 85 | { |
86 | bio->bi_end_io = h->bi_end_io; | 86 | bio->bi_end_io = h->bi_end_io; |
87 | bio->bi_private = h->bi_private; | 87 | bio->bi_private = h->bi_private; |
88 | |||
89 | /* | ||
90 | * Must bump bi_remaining to allow bio to complete with | ||
91 | * restored bi_end_io. | ||
92 | */ | ||
93 | atomic_inc(&bio->bi_remaining); | ||
88 | } | 94 | } |
89 | 95 | ||
90 | /*----------------------------------------------------------------*/ | 96 | /*----------------------------------------------------------------*/ |
@@ -283,6 +289,7 @@ struct per_bio_data { | |||
283 | bool tick:1; | 289 | bool tick:1; |
284 | unsigned req_nr:2; | 290 | unsigned req_nr:2; |
285 | struct dm_deferred_entry *all_io_entry; | 291 | struct dm_deferred_entry *all_io_entry; |
292 | struct dm_hook_info hook_info; | ||
286 | 293 | ||
287 | /* | 294 | /* |
288 | * writethrough fields. These MUST remain at the end of this | 295 | * writethrough fields. These MUST remain at the end of this |
@@ -291,7 +298,6 @@ struct per_bio_data { | |||
291 | */ | 298 | */ |
292 | struct cache *cache; | 299 | struct cache *cache; |
293 | dm_cblock_t cblock; | 300 | dm_cblock_t cblock; |
294 | struct dm_hook_info hook_info; | ||
295 | struct dm_bio_details bio_details; | 301 | struct dm_bio_details bio_details; |
296 | }; | 302 | }; |
297 | 303 | ||
@@ -664,15 +670,18 @@ static void remap_to_origin(struct cache *cache, struct bio *bio) | |||
664 | static void remap_to_cache(struct cache *cache, struct bio *bio, | 670 | static void remap_to_cache(struct cache *cache, struct bio *bio, |
665 | dm_cblock_t cblock) | 671 | dm_cblock_t cblock) |
666 | { | 672 | { |
667 | sector_t bi_sector = bio->bi_sector; | 673 | sector_t bi_sector = bio->bi_iter.bi_sector; |
674 | sector_t block = from_cblock(cblock); | ||
668 | 675 | ||
669 | bio->bi_bdev = cache->cache_dev->bdev; | 676 | bio->bi_bdev = cache->cache_dev->bdev; |
670 | if (!block_size_is_power_of_two(cache)) | 677 | if (!block_size_is_power_of_two(cache)) |
671 | bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + | 678 | bio->bi_iter.bi_sector = |
672 | sector_div(bi_sector, cache->sectors_per_block); | 679 | (block * cache->sectors_per_block) + |
680 | sector_div(bi_sector, cache->sectors_per_block); | ||
673 | else | 681 | else |
674 | bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | | 682 | bio->bi_iter.bi_sector = |
675 | (bi_sector & (cache->sectors_per_block - 1)); | 683 | (block << cache->sectors_per_block_shift) | |
684 | (bi_sector & (cache->sectors_per_block - 1)); | ||
676 | } | 685 | } |
677 | 686 | ||
678 | static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) | 687 | static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) |
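The remap arithmetic in the remap_to_cache() hunk above is unchanged, only restated against bi_iter: the new sector is the cache block's first sector plus the bio's offset within its block. A small sketch of the power-of-two branch with made-up numbers:

/* Illustrative only: sectors_per_block_shift = 6, i.e. 64-sector blocks. */
static sector_t sketch_cache_sector(sector_t cblock, sector_t bi_sector,
				    unsigned sectors_per_block_shift)
{
	sector_t spb = 1 << sectors_per_block_shift;

	return (cblock << sectors_per_block_shift) | (bi_sector & (spb - 1));
}

/* sketch_cache_sector(5, 1027, 6) == 323: cache block 5 starts at sector 320,
 * and sector 1027 sits 3 sectors into its origin block (1027 & 63 == 3).
 * The sector_div() branch computes the same value when the block size is not
 * a power of two. */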
@@ -712,7 +721,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, | |||
712 | 721 | ||
713 | static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) | 722 | static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) |
714 | { | 723 | { |
715 | sector_t block_nr = bio->bi_sector; | 724 | sector_t block_nr = bio->bi_iter.bi_sector; |
716 | 725 | ||
717 | if (!block_size_is_power_of_two(cache)) | 726 | if (!block_size_is_power_of_two(cache)) |
718 | (void) sector_div(block_nr, cache->sectors_per_block); | 727 | (void) sector_div(block_nr, cache->sectors_per_block); |
@@ -970,12 +979,13 @@ static void issue_copy_real(struct dm_cache_migration *mg) | |||
970 | int r; | 979 | int r; |
971 | struct dm_io_region o_region, c_region; | 980 | struct dm_io_region o_region, c_region; |
972 | struct cache *cache = mg->cache; | 981 | struct cache *cache = mg->cache; |
982 | sector_t cblock = from_cblock(mg->cblock); | ||
973 | 983 | ||
974 | o_region.bdev = cache->origin_dev->bdev; | 984 | o_region.bdev = cache->origin_dev->bdev; |
975 | o_region.count = cache->sectors_per_block; | 985 | o_region.count = cache->sectors_per_block; |
976 | 986 | ||
977 | c_region.bdev = cache->cache_dev->bdev; | 987 | c_region.bdev = cache->cache_dev->bdev; |
978 | c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; | 988 | c_region.sector = cblock * cache->sectors_per_block; |
979 | c_region.count = cache->sectors_per_block; | 989 | c_region.count = cache->sectors_per_block; |
980 | 990 | ||
981 | if (mg->writeback || mg->demote) { | 991 | if (mg->writeback || mg->demote) { |
@@ -1002,13 +1012,15 @@ static void overwrite_endio(struct bio *bio, int err) | |||
1002 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1012 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
1003 | unsigned long flags; | 1013 | unsigned long flags; |
1004 | 1014 | ||
1015 | dm_unhook_bio(&pb->hook_info, bio); | ||
1016 | |||
1005 | if (err) | 1017 | if (err) |
1006 | mg->err = true; | 1018 | mg->err = true; |
1007 | 1019 | ||
1020 | mg->requeue_holder = false; | ||
1021 | |||
1008 | spin_lock_irqsave(&cache->lock, flags); | 1022 | spin_lock_irqsave(&cache->lock, flags); |
1009 | list_add_tail(&mg->list, &cache->completed_migrations); | 1023 | list_add_tail(&mg->list, &cache->completed_migrations); |
1010 | dm_unhook_bio(&pb->hook_info, bio); | ||
1011 | mg->requeue_holder = false; | ||
1012 | spin_unlock_irqrestore(&cache->lock, flags); | 1024 | spin_unlock_irqrestore(&cache->lock, flags); |
1013 | 1025 | ||
1014 | wake_worker(cache); | 1026 | wake_worker(cache); |
@@ -1027,7 +1039,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) | |||
1027 | static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) | 1039 | static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) |
1028 | { | 1040 | { |
1029 | return (bio_data_dir(bio) == WRITE) && | 1041 | return (bio_data_dir(bio) == WRITE) && |
1030 | (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); | 1042 | (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); |
1031 | } | 1043 | } |
1032 | 1044 | ||
1033 | static void avoid_copy(struct dm_cache_migration *mg) | 1045 | static void avoid_copy(struct dm_cache_migration *mg) |
@@ -1252,7 +1264,7 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) | |||
1252 | size_t pb_data_size = get_per_bio_data_size(cache); | 1264 | size_t pb_data_size = get_per_bio_data_size(cache); |
1253 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1265 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
1254 | 1266 | ||
1255 | BUG_ON(bio->bi_size); | 1267 | BUG_ON(bio->bi_iter.bi_size); |
1256 | if (!pb->req_nr) | 1268 | if (!pb->req_nr) |
1257 | remap_to_origin(cache, bio); | 1269 | remap_to_origin(cache, bio); |
1258 | else | 1270 | else |
@@ -1275,9 +1287,9 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) | |||
1275 | */ | 1287 | */ |
1276 | static void process_discard_bio(struct cache *cache, struct bio *bio) | 1288 | static void process_discard_bio(struct cache *cache, struct bio *bio) |
1277 | { | 1289 | { |
1278 | dm_block_t start_block = dm_sector_div_up(bio->bi_sector, | 1290 | dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, |
1279 | cache->discard_block_size); | 1291 | cache->discard_block_size); |
1280 | dm_block_t end_block = bio->bi_sector + bio_sectors(bio); | 1292 | dm_block_t end_block = bio_end_sector(bio); |
1281 | dm_block_t b; | 1293 | dm_block_t b; |
1282 | 1294 | ||
1283 | end_block = block_div(end_block, cache->discard_block_size); | 1295 | end_block = block_div(end_block, cache->discard_block_size); |
@@ -2453,20 +2465,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio) | |||
2453 | bool discarded_block; | 2465 | bool discarded_block; |
2454 | struct dm_bio_prison_cell *cell; | 2466 | struct dm_bio_prison_cell *cell; |
2455 | struct policy_result lookup_result; | 2467 | struct policy_result lookup_result; |
2456 | struct per_bio_data *pb; | 2468 | struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); |
2457 | 2469 | ||
2458 | if (from_oblock(block) > from_oblock(cache->origin_blocks)) { | 2470 | if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { |
2459 | /* | 2471 | /* |
2460 | * This can only occur if the io goes to a partial block at | 2472 | * This can only occur if the io goes to a partial block at |
2461 | * the end of the origin device. We don't cache these. | 2473 | * the end of the origin device. We don't cache these. |
2462 | * Just remap to the origin and carry on. | 2474 | * Just remap to the origin and carry on. |
2463 | */ | 2475 | */ |
2464 | remap_to_origin_clear_discard(cache, bio, block); | 2476 | remap_to_origin(cache, bio); |
2465 | return DM_MAPIO_REMAPPED; | 2477 | return DM_MAPIO_REMAPPED; |
2466 | } | 2478 | } |
2467 | 2479 | ||
2468 | pb = init_per_bio_data(bio, pb_data_size); | ||
2469 | |||
2470 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { | 2480 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { |
2471 | defer_bio(cache, bio); | 2481 | defer_bio(cache, bio); |
2472 | return DM_MAPIO_SUBMITTED; | 2482 | return DM_MAPIO_SUBMITTED; |
@@ -2826,12 +2836,13 @@ static void cache_resume(struct dm_target *ti) | |||
2826 | /* | 2836 | /* |
2827 | * Status format: | 2837 | * Status format: |
2828 | * | 2838 | * |
2829 | * <#used metadata blocks>/<#total metadata blocks> | 2839 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> |
2840 | * <cache block size> <#used cache blocks>/<#total cache blocks> | ||
2830 | * <#read hits> <#read misses> <#write hits> <#write misses> | 2841 | * <#read hits> <#read misses> <#write hits> <#write misses> |
2831 | * <#demotions> <#promotions> <#blocks in cache> <#dirty> | 2842 | * <#demotions> <#promotions> <#dirty> |
2832 | * <#features> <features>* | 2843 | * <#features> <features>* |
2833 | * <#core args> <core args> | 2844 | * <#core args> <core args> |
2834 | * <#policy args> <policy args>* | 2845 | * <policy name> <#policy args> <policy args>* |
2835 | */ | 2846 | */ |
2836 | static void cache_status(struct dm_target *ti, status_type_t type, | 2847 | static void cache_status(struct dm_target *ti, status_type_t type, |
2837 | unsigned status_flags, char *result, unsigned maxlen) | 2848 | unsigned status_flags, char *result, unsigned maxlen) |
@@ -2869,17 +2880,20 @@ static void cache_status(struct dm_target *ti, status_type_t type, | |||
2869 | 2880 | ||
2870 | residency = policy_residency(cache->policy); | 2881 | residency = policy_residency(cache->policy); |
2871 | 2882 | ||
2872 | DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", | 2883 | DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ", |
2884 | (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), | ||
2873 | (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), | 2885 | (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), |
2874 | (unsigned long long)nr_blocks_metadata, | 2886 | (unsigned long long)nr_blocks_metadata, |
2887 | cache->sectors_per_block, | ||
2888 | (unsigned long long) from_cblock(residency), | ||
2889 | (unsigned long long) from_cblock(cache->cache_size), | ||
2875 | (unsigned) atomic_read(&cache->stats.read_hit), | 2890 | (unsigned) atomic_read(&cache->stats.read_hit), |
2876 | (unsigned) atomic_read(&cache->stats.read_miss), | 2891 | (unsigned) atomic_read(&cache->stats.read_miss), |
2877 | (unsigned) atomic_read(&cache->stats.write_hit), | 2892 | (unsigned) atomic_read(&cache->stats.write_hit), |
2878 | (unsigned) atomic_read(&cache->stats.write_miss), | 2893 | (unsigned) atomic_read(&cache->stats.write_miss), |
2879 | (unsigned) atomic_read(&cache->stats.demotion), | 2894 | (unsigned) atomic_read(&cache->stats.demotion), |
2880 | (unsigned) atomic_read(&cache->stats.promotion), | 2895 | (unsigned) atomic_read(&cache->stats.promotion), |
2881 | (unsigned long long) from_cblock(residency), | 2896 | (unsigned long long) from_cblock(cache->nr_dirty)); |
2882 | cache->nr_dirty); | ||
2883 | 2897 | ||
2884 | if (writethrough_mode(&cache->features)) | 2898 | if (writethrough_mode(&cache->features)) |
2885 | DMEMIT("1 writethrough "); | 2899 | DMEMIT("1 writethrough "); |
@@ -2896,6 +2910,8 @@ static void cache_status(struct dm_target *ti, status_type_t type, | |||
2896 | } | 2910 | } |
2897 | 2911 | ||
2898 | DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); | 2912 | DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); |
2913 | |||
2914 | DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); | ||
2899 | if (sz < maxlen) { | 2915 | if (sz < maxlen) { |
2900 | r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); | 2916 | r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); |
2901 | if (r) | 2917 | if (r) |
@@ -3129,7 +3145,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
3129 | 3145 | ||
3130 | static struct target_type cache_target = { | 3146 | static struct target_type cache_target = { |
3131 | .name = "cache", | 3147 | .name = "cache", |
3132 | .version = {1, 2, 0}, | 3148 | .version = {1, 3, 0}, |
3133 | .module = THIS_MODULE, | 3149 | .module = THIS_MODULE, |
3134 | .ctr = cache_ctr, | 3150 | .ctr = cache_ctr, |
3135 | .dtr = cache_dtr, | 3151 | .dtr = cache_dtr, |
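Editor's note on the cache_status() changes above: the info line now begins with the metadata block size, adds cache-block residency next to the old hit/miss counters, and emits the policy name ahead of its arguments. A hypothetical example of a line in the new layout (every number is made up and only illustrates field order; the trailing "..." stands for whatever the policy itself emits):

    8 121/4096 128 3/8192 76 23 45 12 0 3 2 1 writethrough 2 migration_threshold 2048 mq ...

That is: metadata block size, used/total metadata blocks, cache block size in sectors, used/total cache blocks, read hits, read misses, write hits, write misses, demotions, promotions, dirty blocks, the feature list, the core arguments, and finally the policy name followed by its own argument list. The target version bump from 1.2.0 to 1.3.0 above presumably goes along with this interface change.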
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 81b0fa660452..784695d22fde 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -39,10 +39,8 @@ struct convert_context { | |||
39 | struct completion restart; | 39 | struct completion restart; |
40 | struct bio *bio_in; | 40 | struct bio *bio_in; |
41 | struct bio *bio_out; | 41 | struct bio *bio_out; |
42 | unsigned int offset_in; | 42 | struct bvec_iter iter_in; |
43 | unsigned int offset_out; | 43 | struct bvec_iter iter_out; |
44 | unsigned int idx_in; | ||
45 | unsigned int idx_out; | ||
46 | sector_t cc_sector; | 44 | sector_t cc_sector; |
47 | atomic_t cc_pending; | 45 | atomic_t cc_pending; |
48 | }; | 46 | }; |
@@ -826,10 +824,10 @@ static void crypt_convert_init(struct crypt_config *cc, | |||
826 | { | 824 | { |
827 | ctx->bio_in = bio_in; | 825 | ctx->bio_in = bio_in; |
828 | ctx->bio_out = bio_out; | 826 | ctx->bio_out = bio_out; |
829 | ctx->offset_in = 0; | 827 | if (bio_in) |
830 | ctx->offset_out = 0; | 828 | ctx->iter_in = bio_in->bi_iter; |
831 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; | 829 | if (bio_out) |
832 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; | 830 | ctx->iter_out = bio_out->bi_iter; |
833 | ctx->cc_sector = sector + cc->iv_offset; | 831 | ctx->cc_sector = sector + cc->iv_offset; |
834 | init_completion(&ctx->restart); | 832 | init_completion(&ctx->restart); |
835 | } | 833 | } |
@@ -857,8 +855,8 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
857 | struct convert_context *ctx, | 855 | struct convert_context *ctx, |
858 | struct ablkcipher_request *req) | 856 | struct ablkcipher_request *req) |
859 | { | 857 | { |
860 | struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); | 858 | struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); |
861 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); | 859 | struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); |
862 | struct dm_crypt_request *dmreq; | 860 | struct dm_crypt_request *dmreq; |
863 | u8 *iv; | 861 | u8 *iv; |
864 | int r; | 862 | int r; |
@@ -869,24 +867,15 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
869 | dmreq->iv_sector = ctx->cc_sector; | 867 | dmreq->iv_sector = ctx->cc_sector; |
870 | dmreq->ctx = ctx; | 868 | dmreq->ctx = ctx; |
871 | sg_init_table(&dmreq->sg_in, 1); | 869 | sg_init_table(&dmreq->sg_in, 1); |
872 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, | 870 | sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT, |
873 | bv_in->bv_offset + ctx->offset_in); | 871 | bv_in.bv_offset); |
874 | 872 | ||
875 | sg_init_table(&dmreq->sg_out, 1); | 873 | sg_init_table(&dmreq->sg_out, 1); |
876 | sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, | 874 | sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, |
877 | bv_out->bv_offset + ctx->offset_out); | 875 | bv_out.bv_offset); |
878 | 876 | ||
879 | ctx->offset_in += 1 << SECTOR_SHIFT; | 877 | bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); |
880 | if (ctx->offset_in >= bv_in->bv_len) { | 878 | bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); |
881 | ctx->offset_in = 0; | ||
882 | ctx->idx_in++; | ||
883 | } | ||
884 | |||
885 | ctx->offset_out += 1 << SECTOR_SHIFT; | ||
886 | if (ctx->offset_out >= bv_out->bv_len) { | ||
887 | ctx->offset_out = 0; | ||
888 | ctx->idx_out++; | ||
889 | } | ||
890 | 879 | ||
891 | if (cc->iv_gen_ops) { | 880 | if (cc->iv_gen_ops) { |
892 | r = cc->iv_gen_ops->generator(cc, iv, dmreq); | 881 | r = cc->iv_gen_ops->generator(cc, iv, dmreq); |
@@ -937,8 +926,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
937 | 926 | ||
938 | atomic_set(&ctx->cc_pending, 1); | 927 | atomic_set(&ctx->cc_pending, 1); |
939 | 928 | ||
940 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && | 929 | while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { |
941 | ctx->idx_out < ctx->bio_out->bi_vcnt) { | ||
942 | 930 | ||
943 | crypt_alloc_req(cc, ctx); | 931 | crypt_alloc_req(cc, ctx); |
944 | 932 | ||
@@ -1021,7 +1009,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, | |||
1021 | size -= len; | 1009 | size -= len; |
1022 | } | 1010 | } |
1023 | 1011 | ||
1024 | if (!clone->bi_size) { | 1012 | if (!clone->bi_iter.bi_size) { |
1025 | bio_put(clone); | 1013 | bio_put(clone); |
1026 | return NULL; | 1014 | return NULL; |
1027 | } | 1015 | } |
@@ -1161,7 +1149,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) | |||
1161 | crypt_inc_pending(io); | 1149 | crypt_inc_pending(io); |
1162 | 1150 | ||
1163 | clone_init(io, clone); | 1151 | clone_init(io, clone); |
1164 | clone->bi_sector = cc->start + io->sector; | 1152 | clone->bi_iter.bi_sector = cc->start + io->sector; |
1165 | 1153 | ||
1166 | generic_make_request(clone); | 1154 | generic_make_request(clone); |
1167 | return 0; | 1155 | return 0; |
@@ -1207,9 +1195,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) | |||
1207 | } | 1195 | } |
1208 | 1196 | ||
1209 | /* crypt_convert should have filled the clone bio */ | 1197 | /* crypt_convert should have filled the clone bio */ |
1210 | BUG_ON(io->ctx.idx_out < clone->bi_vcnt); | 1198 | BUG_ON(io->ctx.iter_out.bi_size); |
1211 | 1199 | ||
1212 | clone->bi_sector = cc->start + io->sector; | 1200 | clone->bi_iter.bi_sector = cc->start + io->sector; |
1213 | 1201 | ||
1214 | if (async) | 1202 | if (async) |
1215 | kcryptd_queue_io(io); | 1203 | kcryptd_queue_io(io); |
@@ -1224,7 +1212,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1224 | struct dm_crypt_io *new_io; | 1212 | struct dm_crypt_io *new_io; |
1225 | int crypt_finished; | 1213 | int crypt_finished; |
1226 | unsigned out_of_pages = 0; | 1214 | unsigned out_of_pages = 0; |
1227 | unsigned remaining = io->base_bio->bi_size; | 1215 | unsigned remaining = io->base_bio->bi_iter.bi_size; |
1228 | sector_t sector = io->sector; | 1216 | sector_t sector = io->sector; |
1229 | int r; | 1217 | int r; |
1230 | 1218 | ||
@@ -1246,9 +1234,9 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1246 | } | 1234 | } |
1247 | 1235 | ||
1248 | io->ctx.bio_out = clone; | 1236 | io->ctx.bio_out = clone; |
1249 | io->ctx.idx_out = 0; | 1237 | io->ctx.iter_out = clone->bi_iter; |
1250 | 1238 | ||
1251 | remaining -= clone->bi_size; | 1239 | remaining -= clone->bi_iter.bi_size; |
1252 | sector += bio_sectors(clone); | 1240 | sector += bio_sectors(clone); |
1253 | 1241 | ||
1254 | crypt_inc_pending(io); | 1242 | crypt_inc_pending(io); |
@@ -1290,8 +1278,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1290 | crypt_inc_pending(new_io); | 1278 | crypt_inc_pending(new_io); |
1291 | crypt_convert_init(cc, &new_io->ctx, NULL, | 1279 | crypt_convert_init(cc, &new_io->ctx, NULL, |
1292 | io->base_bio, sector); | 1280 | io->base_bio, sector); |
1293 | new_io->ctx.idx_in = io->ctx.idx_in; | 1281 | new_io->ctx.iter_in = io->ctx.iter_in; |
1294 | new_io->ctx.offset_in = io->ctx.offset_in; | ||
1295 | 1282 | ||
1296 | /* | 1283 | /* |
1297 | * Fragments after the first use the base_io | 1284 | * Fragments after the first use the base_io |
@@ -1869,11 +1856,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) | |||
1869 | if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { | 1856 | if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { |
1870 | bio->bi_bdev = cc->dev->bdev; | 1857 | bio->bi_bdev = cc->dev->bdev; |
1871 | if (bio_sectors(bio)) | 1858 | if (bio_sectors(bio)) |
1872 | bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); | 1859 | bio->bi_iter.bi_sector = cc->start + |
1860 | dm_target_offset(ti, bio->bi_iter.bi_sector); | ||
1873 | return DM_MAPIO_REMAPPED; | 1861 | return DM_MAPIO_REMAPPED; |
1874 | } | 1862 | } |
1875 | 1863 | ||
1876 | io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); | 1864 | io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); |
1877 | 1865 | ||
1878 | if (bio_data_dir(io->base_bio) == READ) { | 1866 | if (bio_data_dir(io->base_bio) == READ) { |
1879 | if (kcryptd_io_read(io, GFP_NOWAIT)) | 1867 | if (kcryptd_io_read(io, GFP_NOWAIT)) |
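Editor's note: the dm-crypt hunks above replace the hand-maintained offset_in/idx_in and offset_out/idx_out pairs with two private struct bvec_iter copies that crypt_convert_block() advances one 512-byte sector at a time. A minimal sketch of that iteration pattern, assuming only the generic bio/bvec_iter helpers (walk_bio_by_sector() is an invented name, not dm-crypt code):

#include <linux/bio.h>
#include <linux/device-mapper.h>	/* for SECTOR_SHIFT */

static void walk_bio_by_sector(struct bio *bio)
{
	/* Private copy: advancing it leaves bio->bi_iter untouched. */
	struct bvec_iter iter = bio->bi_iter;

	while (iter.bi_size) {
		/* bv.bv_page + bv.bv_offset locate the current sector. */
		struct bio_vec bv = bio_iter_iovec(bio, iter);

		/* ... hand one sector at (bv.bv_page, bv.bv_offset) to the cipher ... */

		bio_advance_iter(bio, &iter, 1 << SECTOR_SHIFT);
	}
}

Because the iterator is only a copy, the same pattern works on bios that are still in flight, which is also why the write path above can hand a partially consumed iter_in to the next fragment's context.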
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2f91d6d4a2cc..42c3a27a14cc 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -24,7 +24,6 @@ struct delay_c { | |||
24 | struct work_struct flush_expired_bios; | 24 | struct work_struct flush_expired_bios; |
25 | struct list_head delayed_bios; | 25 | struct list_head delayed_bios; |
26 | atomic_t may_delay; | 26 | atomic_t may_delay; |
27 | mempool_t *delayed_pool; | ||
28 | 27 | ||
29 | struct dm_dev *dev_read; | 28 | struct dm_dev *dev_read; |
30 | sector_t start_read; | 29 | sector_t start_read; |
@@ -40,14 +39,11 @@ struct delay_c { | |||
40 | struct dm_delay_info { | 39 | struct dm_delay_info { |
41 | struct delay_c *context; | 40 | struct delay_c *context; |
42 | struct list_head list; | 41 | struct list_head list; |
43 | struct bio *bio; | ||
44 | unsigned long expires; | 42 | unsigned long expires; |
45 | }; | 43 | }; |
46 | 44 | ||
47 | static DEFINE_MUTEX(delayed_bios_lock); | 45 | static DEFINE_MUTEX(delayed_bios_lock); |
48 | 46 | ||
49 | static struct kmem_cache *delayed_cache; | ||
50 | |||
51 | static void handle_delayed_timer(unsigned long data) | 47 | static void handle_delayed_timer(unsigned long data) |
52 | { | 48 | { |
53 | struct delay_c *dc = (struct delay_c *)data; | 49 | struct delay_c *dc = (struct delay_c *)data; |
@@ -87,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) | |||
87 | mutex_lock(&delayed_bios_lock); | 83 | mutex_lock(&delayed_bios_lock); |
88 | list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { | 84 | list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { |
89 | if (flush_all || time_after_eq(jiffies, delayed->expires)) { | 85 | if (flush_all || time_after_eq(jiffies, delayed->expires)) { |
86 | struct bio *bio = dm_bio_from_per_bio_data(delayed, | ||
87 | sizeof(struct dm_delay_info)); | ||
90 | list_del(&delayed->list); | 88 | list_del(&delayed->list); |
91 | bio_list_add(&flush_bios, delayed->bio); | 89 | bio_list_add(&flush_bios, bio); |
92 | if ((bio_data_dir(delayed->bio) == WRITE)) | 90 | if ((bio_data_dir(bio) == WRITE)) |
93 | delayed->context->writes--; | 91 | delayed->context->writes--; |
94 | else | 92 | else |
95 | delayed->context->reads--; | 93 | delayed->context->reads--; |
96 | mempool_free(delayed, dc->delayed_pool); | ||
97 | continue; | 94 | continue; |
98 | } | 95 | } |
99 | 96 | ||
@@ -185,12 +182,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
185 | } | 182 | } |
186 | 183 | ||
187 | out: | 184 | out: |
188 | dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); | ||
189 | if (!dc->delayed_pool) { | ||
190 | DMERR("Couldn't create delayed bio pool."); | ||
191 | goto bad_dev_write; | ||
192 | } | ||
193 | |||
194 | dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); | 185 | dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); |
195 | if (!dc->kdelayd_wq) { | 186 | if (!dc->kdelayd_wq) { |
196 | DMERR("Couldn't start kdelayd"); | 187 | DMERR("Couldn't start kdelayd"); |
@@ -206,12 +197,11 @@ out: | |||
206 | 197 | ||
207 | ti->num_flush_bios = 1; | 198 | ti->num_flush_bios = 1; |
208 | ti->num_discard_bios = 1; | 199 | ti->num_discard_bios = 1; |
200 | ti->per_bio_data_size = sizeof(struct dm_delay_info); | ||
209 | ti->private = dc; | 201 | ti->private = dc; |
210 | return 0; | 202 | return 0; |
211 | 203 | ||
212 | bad_queue: | 204 | bad_queue: |
213 | mempool_destroy(dc->delayed_pool); | ||
214 | bad_dev_write: | ||
215 | if (dc->dev_write) | 205 | if (dc->dev_write) |
216 | dm_put_device(ti, dc->dev_write); | 206 | dm_put_device(ti, dc->dev_write); |
217 | bad_dev_read: | 207 | bad_dev_read: |
@@ -232,7 +222,6 @@ static void delay_dtr(struct dm_target *ti) | |||
232 | if (dc->dev_write) | 222 | if (dc->dev_write) |
233 | dm_put_device(ti, dc->dev_write); | 223 | dm_put_device(ti, dc->dev_write); |
234 | 224 | ||
235 | mempool_destroy(dc->delayed_pool); | ||
236 | kfree(dc); | 225 | kfree(dc); |
237 | } | 226 | } |
238 | 227 | ||
@@ -244,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) | |||
244 | if (!delay || !atomic_read(&dc->may_delay)) | 233 | if (!delay || !atomic_read(&dc->may_delay)) |
245 | return 1; | 234 | return 1; |
246 | 235 | ||
247 | delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); | 236 | delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); |
248 | 237 | ||
249 | delayed->context = dc; | 238 | delayed->context = dc; |
250 | delayed->bio = bio; | ||
251 | delayed->expires = expires = jiffies + (delay * HZ / 1000); | 239 | delayed->expires = expires = jiffies + (delay * HZ / 1000); |
252 | 240 | ||
253 | mutex_lock(&delayed_bios_lock); | 241 | mutex_lock(&delayed_bios_lock); |
@@ -289,14 +277,15 @@ static int delay_map(struct dm_target *ti, struct bio *bio) | |||
289 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { | 277 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { |
290 | bio->bi_bdev = dc->dev_write->bdev; | 278 | bio->bi_bdev = dc->dev_write->bdev; |
291 | if (bio_sectors(bio)) | 279 | if (bio_sectors(bio)) |
292 | bio->bi_sector = dc->start_write + | 280 | bio->bi_iter.bi_sector = dc->start_write + |
293 | dm_target_offset(ti, bio->bi_sector); | 281 | dm_target_offset(ti, bio->bi_iter.bi_sector); |
294 | 282 | ||
295 | return delay_bio(dc, dc->write_delay, bio); | 283 | return delay_bio(dc, dc->write_delay, bio); |
296 | } | 284 | } |
297 | 285 | ||
298 | bio->bi_bdev = dc->dev_read->bdev; | 286 | bio->bi_bdev = dc->dev_read->bdev; |
299 | bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); | 287 | bio->bi_iter.bi_sector = dc->start_read + |
288 | dm_target_offset(ti, bio->bi_iter.bi_sector); | ||
300 | 289 | ||
301 | return delay_bio(dc, dc->read_delay, bio); | 290 | return delay_bio(dc, dc->read_delay, bio); |
302 | } | 291 | } |
@@ -356,13 +345,7 @@ static struct target_type delay_target = { | |||
356 | 345 | ||
357 | static int __init dm_delay_init(void) | 346 | static int __init dm_delay_init(void) |
358 | { | 347 | { |
359 | int r = -ENOMEM; | 348 | int r; |
360 | |||
361 | delayed_cache = KMEM_CACHE(dm_delay_info, 0); | ||
362 | if (!delayed_cache) { | ||
363 | DMERR("Couldn't create delayed bio cache."); | ||
364 | goto bad_memcache; | ||
365 | } | ||
366 | 349 | ||
367 | r = dm_register_target(&delay_target); | 350 | r = dm_register_target(&delay_target); |
368 | if (r < 0) { | 351 | if (r < 0) { |
@@ -373,15 +356,12 @@ static int __init dm_delay_init(void) | |||
373 | return 0; | 356 | return 0; |
374 | 357 | ||
375 | bad_register: | 358 | bad_register: |
376 | kmem_cache_destroy(delayed_cache); | ||
377 | bad_memcache: | ||
378 | return r; | 359 | return r; |
379 | } | 360 | } |
380 | 361 | ||
381 | static void __exit dm_delay_exit(void) | 362 | static void __exit dm_delay_exit(void) |
382 | { | 363 | { |
383 | dm_unregister_target(&delay_target); | 364 | dm_unregister_target(&delay_target); |
384 | kmem_cache_destroy(delayed_cache); | ||
385 | } | 365 | } |
386 | 366 | ||
387 | /* Module hooks */ | 367 | /* Module hooks */ |
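Editor's note: dm-delay above drops its private kmem cache and mempool in favour of the per-bio data area that device-mapper reserves with each bio once ti->per_bio_data_size is set. A hedged sketch of that pattern (the example_* names are invented; ti->per_bio_data_size, dm_per_bio_data() and dm_bio_from_per_bio_data() are the actual interface):

#include <linux/device-mapper.h>
#include <linux/jiffies.h>

struct example_info {			/* stored in DM's per-bio data area */
	struct list_head list;
	unsigned long expires;
};

/* ->ctr(): reserve space; DM then allocates it with every incoming bio. */
static void example_reserve(struct dm_target *ti)
{
	ti->per_bio_data_size = sizeof(struct example_info);
}

/* ->map(): no mempool_alloc() needed, the structure already exists. */
static struct example_info *example_info_of(struct bio *bio)
{
	struct example_info *info = dm_per_bio_data(bio, sizeof(struct example_info));

	info->expires = jiffies + HZ;
	return info;
}

/* When draining a delayed list, recover the owning bio from the info. */
static struct bio *example_bio_of(struct example_info *info)
{
	return dm_bio_from_per_bio_data(info, sizeof(struct example_info));
}

The payoff is visible in the diff: the allocation failure path, the slab create/destroy calls and the stored back-pointer to the bio all disappear.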
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index c80a0ec5f126..b257e46876d3 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c | |||
@@ -248,7 +248,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) | |||
248 | 248 | ||
249 | bio->bi_bdev = fc->dev->bdev; | 249 | bio->bi_bdev = fc->dev->bdev; |
250 | if (bio_sectors(bio)) | 250 | if (bio_sectors(bio)) |
251 | bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); | 251 | bio->bi_iter.bi_sector = |
252 | flakey_map_sector(ti, bio->bi_iter.bi_sector); | ||
252 | } | 253 | } |
253 | 254 | ||
254 | static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) | 255 | static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) |
@@ -265,8 +266,8 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) | |||
265 | DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " | 266 | DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " |
266 | "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", | 267 | "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", |
267 | bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, | 268 | bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, |
268 | (bio_data_dir(bio) == WRITE) ? 'w' : 'r', | 269 | (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw, |
269 | bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); | 270 | (unsigned long long)bio->bi_iter.bi_sector, bio_bytes); |
270 | } | 271 | } |
271 | } | 272 | } |
272 | 273 | ||
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2a20986a2fec..3842ac738f98 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -201,26 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse | |||
201 | /* | 201 | /* |
202 | * Functions for getting the pages from a bvec. | 202 | * Functions for getting the pages from a bvec. |
203 | */ | 203 | */ |
204 | static void bvec_get_page(struct dpages *dp, | 204 | static void bio_get_page(struct dpages *dp, struct page **p, |
205 | struct page **p, unsigned long *len, unsigned *offset) | 205 | unsigned long *len, unsigned *offset) |
206 | { | 206 | { |
207 | struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; | 207 | struct bio_vec *bvec = dp->context_ptr; |
208 | *p = bvec->bv_page; | 208 | *p = bvec->bv_page; |
209 | *len = bvec->bv_len; | 209 | *len = bvec->bv_len - dp->context_u; |
210 | *offset = bvec->bv_offset; | 210 | *offset = bvec->bv_offset + dp->context_u; |
211 | } | 211 | } |
212 | 212 | ||
213 | static void bvec_next_page(struct dpages *dp) | 213 | static void bio_next_page(struct dpages *dp) |
214 | { | 214 | { |
215 | struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; | 215 | struct bio_vec *bvec = dp->context_ptr; |
216 | dp->context_ptr = bvec + 1; | 216 | dp->context_ptr = bvec + 1; |
217 | dp->context_u = 0; | ||
217 | } | 218 | } |
218 | 219 | ||
219 | static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) | 220 | static void bio_dp_init(struct dpages *dp, struct bio *bio) |
220 | { | 221 | { |
221 | dp->get_page = bvec_get_page; | 222 | dp->get_page = bio_get_page; |
222 | dp->next_page = bvec_next_page; | 223 | dp->next_page = bio_next_page; |
223 | dp->context_ptr = bvec; | 224 | dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); |
225 | dp->context_u = bio->bi_iter.bi_bvec_done; | ||
224 | } | 226 | } |
225 | 227 | ||
226 | /* | 228 | /* |
@@ -304,14 +306,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
304 | dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); | 306 | dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); |
305 | 307 | ||
306 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); | 308 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); |
307 | bio->bi_sector = where->sector + (where->count - remaining); | 309 | bio->bi_iter.bi_sector = where->sector + (where->count - remaining); |
308 | bio->bi_bdev = where->bdev; | 310 | bio->bi_bdev = where->bdev; |
309 | bio->bi_end_io = endio; | 311 | bio->bi_end_io = endio; |
310 | store_io_and_region_in_bio(bio, io, region); | 312 | store_io_and_region_in_bio(bio, io, region); |
311 | 313 | ||
312 | if (rw & REQ_DISCARD) { | 314 | if (rw & REQ_DISCARD) { |
313 | num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); | 315 | num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); |
314 | bio->bi_size = num_sectors << SECTOR_SHIFT; | 316 | bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; |
315 | remaining -= num_sectors; | 317 | remaining -= num_sectors; |
316 | } else if (rw & REQ_WRITE_SAME) { | 318 | } else if (rw & REQ_WRITE_SAME) { |
317 | /* | 319 | /* |
@@ -320,7 +322,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
320 | dp->get_page(dp, &page, &len, &offset); | 322 | dp->get_page(dp, &page, &len, &offset); |
321 | bio_add_page(bio, page, logical_block_size, offset); | 323 | bio_add_page(bio, page, logical_block_size, offset); |
322 | num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); | 324 | num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); |
323 | bio->bi_size = num_sectors << SECTOR_SHIFT; | 325 | bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; |
324 | 326 | ||
325 | offset = 0; | 327 | offset = 0; |
326 | remaining -= num_sectors; | 328 | remaining -= num_sectors; |
@@ -457,8 +459,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, | |||
457 | list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); | 459 | list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); |
458 | break; | 460 | break; |
459 | 461 | ||
460 | case DM_IO_BVEC: | 462 | case DM_IO_BIO: |
461 | bvec_dp_init(dp, io_req->mem.ptr.bvec); | 463 | bio_dp_init(dp, io_req->mem.ptr.bio); |
462 | break; | 464 | break; |
463 | 465 | ||
464 | case DM_IO_VMA: | 466 | case DM_IO_VMA: |
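Editor's note: the dm-io change above (bvec_dp_init() becoming bio_dp_init()) exists so a dpages iterator can start part-way into a bio_vec: context_ptr points at the current bvec and context_u carries bi_iter.bi_bvec_done, the number of bytes of that bvec already consumed. A small, self-contained illustration of the arithmetic, with an invented partial_bvec type standing in for the (context_ptr, context_u) pair:

#include <linux/bio.h>

struct partial_bvec {
	struct bio_vec *bvec;	/* current segment */
	unsigned done;		/* bytes of *bvec already consumed */
};

static void partial_bvec_page(const struct partial_bvec *p, struct page **page,
			      unsigned long *len, unsigned *offset)
{
	*page   = p->bvec->bv_page;
	*len    = p->bvec->bv_len - p->done;	/* what is left of this segment */
	*offset = p->bvec->bv_offset + p->done;	/* skip the consumed prefix */
}

Once a segment is exhausted, the consumed count resets to zero for the next bvec, exactly as bio_next_page() does above.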
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4f99d267340c..53e848c10939 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -85,7 +85,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) | |||
85 | 85 | ||
86 | bio->bi_bdev = lc->dev->bdev; | 86 | bio->bi_bdev = lc->dev->bdev; |
87 | if (bio_sectors(bio)) | 87 | if (bio_sectors(bio)) |
88 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | 88 | bio->bi_iter.bi_sector = |
89 | linear_map_sector(ti, bio->bi_iter.bi_sector); | ||
89 | } | 90 | } |
90 | 91 | ||
91 | static int linear_map(struct dm_target *ti, struct bio *bio) | 92 | static int linear_map(struct dm_target *ti, struct bio *bio) |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 9429159d9ee3..b953db6cc229 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -10,10 +10,11 @@ | |||
10 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
11 | #include <linux/dm-log-userspace.h> | 11 | #include <linux/dm-log-userspace.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/workqueue.h> | ||
13 | 14 | ||
14 | #include "dm-log-userspace-transfer.h" | 15 | #include "dm-log-userspace-transfer.h" |
15 | 16 | ||
16 | #define DM_LOG_USERSPACE_VSN "1.1.0" | 17 | #define DM_LOG_USERSPACE_VSN "1.3.0" |
17 | 18 | ||
18 | struct flush_entry { | 19 | struct flush_entry { |
19 | int type; | 20 | int type; |
@@ -58,6 +59,18 @@ struct log_c { | |||
58 | spinlock_t flush_lock; | 59 | spinlock_t flush_lock; |
59 | struct list_head mark_list; | 60 | struct list_head mark_list; |
60 | struct list_head clear_list; | 61 | struct list_head clear_list; |
62 | |||
63 | /* | ||
64 | * Workqueue for flush of clear region requests. | ||
65 | */ | ||
66 | struct workqueue_struct *dmlog_wq; | ||
67 | struct delayed_work flush_log_work; | ||
68 | atomic_t sched_flush; | ||
69 | |||
70 | /* | ||
71 | * Combine userspace flush and mark requests for efficiency. | ||
72 | */ | ||
73 | uint32_t integrated_flush; | ||
61 | }; | 74 | }; |
62 | 75 | ||
63 | static mempool_t *flush_entry_pool; | 76 | static mempool_t *flush_entry_pool; |
@@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti, | |||
122 | 135 | ||
123 | *ctr_str = NULL; | 136 | *ctr_str = NULL; |
124 | 137 | ||
138 | /* | ||
139 | * Determine overall size of the string. | ||
140 | */ | ||
125 | for (i = 0, str_size = 0; i < argc; i++) | 141 | for (i = 0, str_size = 0; i < argc; i++) |
126 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | 142 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ |
127 | 143 | ||
@@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti, | |||
141 | return str_size; | 157 | return str_size; |
142 | } | 158 | } |
143 | 159 | ||
160 | static void do_flush(struct work_struct *work) | ||
161 | { | ||
162 | int r; | ||
163 | struct log_c *lc = container_of(work, struct log_c, flush_log_work.work); | ||
164 | |||
165 | atomic_set(&lc->sched_flush, 0); | ||
166 | |||
167 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); | ||
168 | |||
169 | if (r) | ||
170 | dm_table_event(lc->ti->table); | ||
171 | } | ||
172 | |||
144 | /* | 173 | /* |
145 | * userspace_ctr | 174 | * userspace_ctr |
146 | * | 175 | * |
147 | * argv contains: | 176 | * argv contains: |
148 | * <UUID> <other args> | 177 | * <UUID> [integrated_flush] <other args> |
149 | * Where 'other args' is the userspace implementation specific log | 178 | * Where 'other args' are the userspace implementation-specific log |
150 | * arguments. An example might be: | 179 | * arguments. |
151 | * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] | 180 | * |
181 | * Example: | ||
182 | * <UUID> [integrated_flush] clustered-disk <arg count> <log dev> | ||
183 | * <region_size> [[no]sync] | ||
184 | * | ||
185 | * This module strips off the <UUID> and uses it for identification | ||
186 | * purposes when communicating with userspace about a log. | ||
152 | * | 187 | * |
153 | * So, this module will strip off the <UUID> for identification purposes | 188 | * If integrated_flush is defined, the kernel combines flush |
154 | * when communicating with userspace about a log; but will pass on everything | 189 | * and mark requests. |
155 | * else. | 190 | * |
191 | * The rest of the line, beginning with 'clustered-disk', is passed | ||
192 | * to the userspace ctr function. | ||
156 | */ | 193 | */ |
157 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | 194 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, |
158 | unsigned argc, char **argv) | 195 | unsigned argc, char **argv) |
@@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
188 | return -EINVAL; | 225 | return -EINVAL; |
189 | } | 226 | } |
190 | 227 | ||
228 | lc->usr_argc = argc; | ||
229 | |||
191 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | 230 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); |
231 | argc--; | ||
232 | argv++; | ||
192 | spin_lock_init(&lc->flush_lock); | 233 | spin_lock_init(&lc->flush_lock); |
193 | INIT_LIST_HEAD(&lc->mark_list); | 234 | INIT_LIST_HEAD(&lc->mark_list); |
194 | INIT_LIST_HEAD(&lc->clear_list); | 235 | INIT_LIST_HEAD(&lc->clear_list); |
195 | 236 | ||
196 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | 237 | if (!strcasecmp(argv[0], "integrated_flush")) { |
238 | lc->integrated_flush = 1; | ||
239 | argc--; | ||
240 | argv++; | ||
241 | } | ||
242 | |||
243 | str_size = build_constructor_string(ti, argc, argv, &ctr_str); | ||
197 | if (str_size < 0) { | 244 | if (str_size < 0) { |
198 | kfree(lc); | 245 | kfree(lc); |
199 | return str_size; | 246 | return str_size; |
@@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
246 | DMERR("Failed to register %s with device-mapper", | 293 | DMERR("Failed to register %s with device-mapper", |
247 | devices_rdata); | 294 | devices_rdata); |
248 | } | 295 | } |
296 | |||
297 | if (lc->integrated_flush) { | ||
298 | lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); | ||
299 | if (!lc->dmlog_wq) { | ||
300 | DMERR("couldn't start dmlogd"); | ||
301 | r = -ENOMEM; | ||
302 | goto out; | ||
303 | } | ||
304 | |||
305 | INIT_DELAYED_WORK(&lc->flush_log_work, do_flush); | ||
306 | atomic_set(&lc->sched_flush, 0); | ||
307 | } | ||
308 | |||
249 | out: | 309 | out: |
250 | kfree(devices_rdata); | 310 | kfree(devices_rdata); |
251 | if (r) { | 311 | if (r) { |
@@ -253,7 +313,6 @@ out: | |||
253 | kfree(ctr_str); | 313 | kfree(ctr_str); |
254 | } else { | 314 | } else { |
255 | lc->usr_argv_str = ctr_str; | 315 | lc->usr_argv_str = ctr_str; |
256 | lc->usr_argc = argc; | ||
257 | log->context = lc; | 316 | log->context = lc; |
258 | } | 317 | } |
259 | 318 | ||
@@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log) | |||
264 | { | 323 | { |
265 | struct log_c *lc = log->context; | 324 | struct log_c *lc = log->context; |
266 | 325 | ||
326 | if (lc->integrated_flush) { | ||
327 | /* flush workqueue */ | ||
328 | if (atomic_read(&lc->sched_flush)) | ||
329 | flush_delayed_work(&lc->flush_log_work); | ||
330 | |||
331 | destroy_workqueue(lc->dmlog_wq); | ||
332 | } | ||
333 | |||
267 | (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, | 334 | (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, |
268 | NULL, 0, | 335 | NULL, 0, NULL, NULL); |
269 | NULL, NULL); | ||
270 | 336 | ||
271 | if (lc->log_dev) | 337 | if (lc->log_dev) |
272 | dm_put_device(lc->ti, lc->log_dev); | 338 | dm_put_device(lc->ti, lc->log_dev); |
@@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log) | |||
283 | struct log_c *lc = log->context; | 349 | struct log_c *lc = log->context; |
284 | 350 | ||
285 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, | 351 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, |
286 | NULL, 0, | 352 | NULL, 0, NULL, NULL); |
287 | NULL, NULL); | ||
288 | 353 | ||
289 | return r; | 354 | return r; |
290 | } | 355 | } |
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log) | |||
294 | int r; | 359 | int r; |
295 | struct log_c *lc = log->context; | 360 | struct log_c *lc = log->context; |
296 | 361 | ||
362 | /* | ||
363 | * Run planned flush earlier. | ||
364 | */ | ||
365 | if (lc->integrated_flush && atomic_read(&lc->sched_flush)) | ||
366 | flush_delayed_work(&lc->flush_log_work); | ||
367 | |||
297 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, | 368 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, |
298 | NULL, 0, | 369 | NULL, 0, NULL, NULL); |
299 | NULL, NULL); | ||
300 | 370 | ||
301 | return r; | 371 | return r; |
302 | } | 372 | } |
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log) | |||
308 | 378 | ||
309 | lc->in_sync_hint = 0; | 379 | lc->in_sync_hint = 0; |
310 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, | 380 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, |
311 | NULL, 0, | 381 | NULL, 0, NULL, NULL); |
312 | NULL, NULL); | ||
313 | 382 | ||
314 | return r; | 383 | return r; |
315 | } | 384 | } |
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) | |||
405 | return r; | 474 | return r; |
406 | } | 475 | } |
407 | 476 | ||
408 | static int flush_by_group(struct log_c *lc, struct list_head *flush_list) | 477 | static int flush_by_group(struct log_c *lc, struct list_head *flush_list, |
478 | int flush_with_payload) | ||
409 | { | 479 | { |
410 | int r = 0; | 480 | int r = 0; |
411 | int count; | 481 | int count; |
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list) | |||
431 | break; | 501 | break; |
432 | } | 502 | } |
433 | 503 | ||
434 | r = userspace_do_request(lc, lc->uuid, type, | 504 | if (flush_with_payload) { |
435 | (char *)(group), | 505 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, |
436 | count * sizeof(uint64_t), | 506 | (char *)(group), |
437 | NULL, NULL); | 507 | count * sizeof(uint64_t), |
438 | if (r) { | 508 | NULL, NULL); |
439 | /* Group send failed. Attempt one-by-one. */ | 509 | /* |
440 | list_splice_init(&tmp_list, flush_list); | 510 | * Integrated flush failed. |
441 | r = flush_one_by_one(lc, flush_list); | 511 | */ |
442 | break; | 512 | if (r) |
513 | break; | ||
514 | } else { | ||
515 | r = userspace_do_request(lc, lc->uuid, type, | ||
516 | (char *)(group), | ||
517 | count * sizeof(uint64_t), | ||
518 | NULL, NULL); | ||
519 | if (r) { | ||
520 | /* | ||
521 | * Group send failed. Attempt one-by-one. | ||
522 | */ | ||
523 | list_splice_init(&tmp_list, flush_list); | ||
524 | r = flush_one_by_one(lc, flush_list); | ||
525 | break; | ||
526 | } | ||
443 | } | 527 | } |
444 | } | 528 | } |
445 | 529 | ||
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log) | |||
476 | struct log_c *lc = log->context; | 560 | struct log_c *lc = log->context; |
477 | LIST_HEAD(mark_list); | 561 | LIST_HEAD(mark_list); |
478 | LIST_HEAD(clear_list); | 562 | LIST_HEAD(clear_list); |
563 | int mark_list_is_empty; | ||
564 | int clear_list_is_empty; | ||
479 | struct flush_entry *fe, *tmp_fe; | 565 | struct flush_entry *fe, *tmp_fe; |
480 | 566 | ||
481 | spin_lock_irqsave(&lc->flush_lock, flags); | 567 | spin_lock_irqsave(&lc->flush_lock, flags); |
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log) | |||
483 | list_splice_init(&lc->clear_list, &clear_list); | 569 | list_splice_init(&lc->clear_list, &clear_list); |
484 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 570 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
485 | 571 | ||
486 | if (list_empty(&mark_list) && list_empty(&clear_list)) | 572 | mark_list_is_empty = list_empty(&mark_list); |
573 | clear_list_is_empty = list_empty(&clear_list); | ||
574 | |||
575 | if (mark_list_is_empty && clear_list_is_empty) | ||
487 | return 0; | 576 | return 0; |
488 | 577 | ||
489 | r = flush_by_group(lc, &mark_list); | 578 | r = flush_by_group(lc, &clear_list, 0); |
490 | if (r) | 579 | if (r) |
491 | goto fail; | 580 | goto out; |
492 | 581 | ||
493 | r = flush_by_group(lc, &clear_list); | 582 | if (!lc->integrated_flush) { |
583 | r = flush_by_group(lc, &mark_list, 0); | ||
584 | if (r) | ||
585 | goto out; | ||
586 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | ||
587 | NULL, 0, NULL, NULL); | ||
588 | goto out; | ||
589 | } | ||
590 | |||
591 | /* | ||
592 | * Send integrated flush request with mark_list as payload. | ||
593 | */ | ||
594 | r = flush_by_group(lc, &mark_list, 1); | ||
494 | if (r) | 595 | if (r) |
495 | goto fail; | 596 | goto out; |
496 | 597 | ||
497 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | 598 | if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) { |
498 | NULL, 0, NULL, NULL); | 599 | /* |
600 | * When there are only clear region requests, | ||
601 | * we schedule a flush in the future. | ||
602 | */ | ||
603 | queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ); | ||
604 | atomic_set(&lc->sched_flush, 1); | ||
605 | } else { | ||
606 | /* | ||
607 | * Cancel pending flush because we | ||
608 | * have already flushed in mark_region. | ||
609 | */ | ||
610 | cancel_delayed_work(&lc->flush_log_work); | ||
611 | atomic_set(&lc->sched_flush, 0); | ||
612 | } | ||
499 | 613 | ||
500 | fail: | 614 | out: |
501 | /* | 615 | /* |
502 | * We can safely remove these entries, even if failure. | 616 | * We can safely remove these entries, even after failure. |
503 | * Calling code will receive an error and will know that | 617 | * Calling code will receive an error and will know that |
504 | * the log facility has failed. | 618 | * the log facility has failed. |
505 | */ | 619 | */ |
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | |||
603 | 717 | ||
604 | rdata_size = sizeof(pkg); | 718 | rdata_size = sizeof(pkg); |
605 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | 719 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, |
606 | NULL, 0, | 720 | NULL, 0, (char *)&pkg, &rdata_size); |
607 | (char *)&pkg, &rdata_size); | ||
608 | 721 | ||
609 | *region = pkg.r; | 722 | *region = pkg.r; |
610 | return (r) ? r : (int)pkg.i; | 723 | return (r) ? r : (int)pkg.i; |
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log, | |||
630 | pkg.i = (int64_t)in_sync; | 743 | pkg.i = (int64_t)in_sync; |
631 | 744 | ||
632 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, | 745 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, |
633 | (char *)&pkg, sizeof(pkg), | 746 | (char *)&pkg, sizeof(pkg), NULL, NULL); |
634 | NULL, NULL); | ||
635 | 747 | ||
636 | /* | 748 | /* |
637 | * It would be nice to be able to report failures. | 749 | * It would be nice to be able to report failures. |
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log) | |||
657 | 769 | ||
658 | rdata_size = sizeof(sync_count); | 770 | rdata_size = sizeof(sync_count); |
659 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | 771 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, |
660 | NULL, 0, | 772 | NULL, 0, (char *)&sync_count, &rdata_size); |
661 | (char *)&sync_count, &rdata_size); | ||
662 | 773 | ||
663 | if (r) | 774 | if (r) |
664 | return 0; | 775 | return 0; |
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | |||
685 | switch (status_type) { | 796 | switch (status_type) { |
686 | case STATUSTYPE_INFO: | 797 | case STATUSTYPE_INFO: |
687 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | 798 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, |
688 | NULL, 0, | 799 | NULL, 0, result, &sz); |
689 | result, &sz); | ||
690 | 800 | ||
691 | if (r) { | 801 | if (r) { |
692 | sz = 0; | 802 | sz = 0; |
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | |||
699 | BUG_ON(!table_args); /* There will always be a ' ' */ | 809 | BUG_ON(!table_args); /* There will always be a ' ' */ |
700 | table_args++; | 810 | table_args++; |
701 | 811 | ||
702 | DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, | 812 | DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid); |
703 | lc->uuid, table_args); | 813 | if (lc->integrated_flush) |
814 | DMEMIT("integrated_flush "); | ||
815 | DMEMIT("%s ", table_args); | ||
704 | break; | 816 | break; |
705 | } | 817 | } |
706 | return (r) ? 0 : (int)sz; | 818 | return (r) ? 0 : (int)sz; |
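Editor's note: the integrated_flush support above is built on an ordinary delayed-work pattern: when a flush round carried only clear-region requests, the real userspace flush is postponed (the driver uses a 3-second delay) so several rounds can be coalesced, and when a mark request has already forced a flush, the pending deferred work is cancelled. A rough sketch of that pattern with invented lazy_flush_* names:

#include <linux/workqueue.h>
#include <linux/atomic.h>

struct lazy_flush {
	struct workqueue_struct *wq;
	struct delayed_work work;
	atomic_t scheduled;
};

static void lazy_flush_fn(struct work_struct *w)
{
	struct lazy_flush *lf = container_of(w, struct lazy_flush, work.work);

	atomic_set(&lf->scheduled, 0);
	/* ... send the real flush request to userspace here ... */
}

/* Only clear requests were pending: the flush can wait and be batched. */
static void lazy_flush_schedule(struct lazy_flush *lf)
{
	if (!atomic_xchg(&lf->scheduled, 1))
		queue_delayed_work(lf->wq, &lf->work, 3 * HZ);
}

/* A mark request already flushed for us: drop the deferred flush. */
static void lazy_flush_cancel(struct lazy_flush *lf)
{
	cancel_delayed_work(&lf->work);
	atomic_set(&lf->scheduled, 0);
}

userspace_dtr() and userspace_postsuspend() above add flush_delayed_work() calls for the same reason: a scheduled-but-not-yet-run flush must not be lost across teardown or suspend.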
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6eb9dc9ef8f3..422a9fdeb53e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1626 | /* | 1626 | /* |
1627 | * Only pass ioctls through if the device sizes match exactly. | 1627 | * Only pass ioctls through if the device sizes match exactly. |
1628 | */ | 1628 | */ |
1629 | if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) | 1629 | if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) { |
1630 | r = scsi_verify_blk_ioctl(NULL, cmd); | 1630 | int err = scsi_verify_blk_ioctl(NULL, cmd); |
1631 | if (err) | ||
1632 | r = err; | ||
1633 | } | ||
1631 | 1634 | ||
1632 | if (r == -ENOTCONN && !fatal_signal_pending(current)) | 1635 | if (r == -ENOTCONN && !fatal_signal_pending(current)) |
1633 | queue_work(kmultipathd, &m->process_queued_ios); | 1636 | queue_work(kmultipathd, &m->process_queued_ios); |
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9584443c5614..7dfdb5c746d6 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -432,7 +432,7 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) | |||
432 | region_t region = dm_rh_bio_to_region(ms->rh, bio); | 432 | region_t region = dm_rh_bio_to_region(ms->rh, bio); |
433 | 433 | ||
434 | if (log->type->in_sync(log, region, 0)) | 434 | if (log->type->in_sync(log, region, 0)) |
435 | return choose_mirror(ms, bio->bi_sector) ? 1 : 0; | 435 | return choose_mirror(ms, bio->bi_iter.bi_sector) ? 1 : 0; |
436 | 436 | ||
437 | return 0; | 437 | return 0; |
438 | } | 438 | } |
@@ -442,15 +442,15 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) | |||
442 | */ | 442 | */ |
443 | static sector_t map_sector(struct mirror *m, struct bio *bio) | 443 | static sector_t map_sector(struct mirror *m, struct bio *bio) |
444 | { | 444 | { |
445 | if (unlikely(!bio->bi_size)) | 445 | if (unlikely(!bio->bi_iter.bi_size)) |
446 | return 0; | 446 | return 0; |
447 | return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); | 447 | return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector); |
448 | } | 448 | } |
449 | 449 | ||
450 | static void map_bio(struct mirror *m, struct bio *bio) | 450 | static void map_bio(struct mirror *m, struct bio *bio) |
451 | { | 451 | { |
452 | bio->bi_bdev = m->dev->bdev; | 452 | bio->bi_bdev = m->dev->bdev; |
453 | bio->bi_sector = map_sector(m, bio); | 453 | bio->bi_iter.bi_sector = map_sector(m, bio); |
454 | } | 454 | } |
455 | 455 | ||
456 | static void map_region(struct dm_io_region *io, struct mirror *m, | 456 | static void map_region(struct dm_io_region *io, struct mirror *m, |
@@ -526,8 +526,8 @@ static void read_async_bio(struct mirror *m, struct bio *bio) | |||
526 | struct dm_io_region io; | 526 | struct dm_io_region io; |
527 | struct dm_io_request io_req = { | 527 | struct dm_io_request io_req = { |
528 | .bi_rw = READ, | 528 | .bi_rw = READ, |
529 | .mem.type = DM_IO_BVEC, | 529 | .mem.type = DM_IO_BIO, |
530 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 530 | .mem.ptr.bio = bio, |
531 | .notify.fn = read_callback, | 531 | .notify.fn = read_callback, |
532 | .notify.context = bio, | 532 | .notify.context = bio, |
533 | .client = m->ms->io_client, | 533 | .client = m->ms->io_client, |
@@ -559,7 +559,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) | |||
559 | * We can only read balance if the region is in sync. | 559 | * We can only read balance if the region is in sync. |
560 | */ | 560 | */ |
561 | if (likely(region_in_sync(ms, region, 1))) | 561 | if (likely(region_in_sync(ms, region, 1))) |
562 | m = choose_mirror(ms, bio->bi_sector); | 562 | m = choose_mirror(ms, bio->bi_iter.bi_sector); |
563 | else if (m && atomic_read(&m->error_count)) | 563 | else if (m && atomic_read(&m->error_count)) |
564 | m = NULL; | 564 | m = NULL; |
565 | 565 | ||
@@ -629,8 +629,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
629 | struct mirror *m; | 629 | struct mirror *m; |
630 | struct dm_io_request io_req = { | 630 | struct dm_io_request io_req = { |
631 | .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), | 631 | .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), |
632 | .mem.type = DM_IO_BVEC, | 632 | .mem.type = DM_IO_BIO, |
633 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 633 | .mem.ptr.bio = bio, |
634 | .notify.fn = write_callback, | 634 | .notify.fn = write_callback, |
635 | .notify.context = bio, | 635 | .notify.context = bio, |
636 | .client = ms->io_client, | 636 | .client = ms->io_client, |
@@ -1181,7 +1181,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) | |||
1181 | * The region is in-sync and we can perform reads directly. | 1181 | * The region is in-sync and we can perform reads directly. |
1182 | * Store enough information so we can retry if it fails. | 1182 | * Store enough information so we can retry if it fails. |
1183 | */ | 1183 | */ |
1184 | m = choose_mirror(ms, bio->bi_sector); | 1184 | m = choose_mirror(ms, bio->bi_iter.bi_sector); |
1185 | if (unlikely(!m)) | 1185 | if (unlikely(!m)) |
1186 | return -EIO; | 1186 | return -EIO; |
1187 | 1187 | ||
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) | |||
1244 | 1244 | ||
1245 | dm_bio_restore(bd, bio); | 1245 | dm_bio_restore(bd, bio); |
1246 | bio_record->details.bi_bdev = NULL; | 1246 | bio_record->details.bi_bdev = NULL; |
1247 | |||
1248 | atomic_inc(&bio->bi_remaining); | ||
1249 | |||
1247 | queue_bio(ms, bio, rw); | 1250 | queue_bio(ms, bio, rw); |
1248 | return DM_ENDIO_INCOMPLETE; | 1251 | return DM_ENDIO_INCOMPLETE; |
1249 | } | 1252 | } |
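Editor's note: apart from the mechanical bi_iter renames, the one behavioural change in dm-raid1 above is the atomic_inc(&bio->bi_remaining) in mirror_end_io(): with the bio-chaining work, bio_endio() drops a bi_remaining reference, so a bio that its own completion handler decides to retry must take that reference back before being requeued. An illustrative fragment (dm-raid1 actually requeues onto its own read list via queue_bio() rather than calling generic_make_request() directly):

#include <linux/blkdev.h>

/* Resubmit a bio whose completion handler chose to retry it. */
static void retry_from_endio(struct bio *bio)
{
	/* Restore the reference that bio_endio() has already dropped. */
	atomic_inc(&bio->bi_remaining);
	generic_make_request(bio);
}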
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 69732e03eb34..b929fd5f4984 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -126,7 +126,8 @@ EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); | |||
126 | 126 | ||
127 | region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) | 127 | region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) |
128 | { | 128 | { |
129 | return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); | 129 | return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector - |
130 | rh->target_begin); | ||
130 | } | 131 | } |
131 | EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); | 132 | EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); |
132 | 133 | ||
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2d2b1b7588d7..d6e88178d22c 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -13,10 +13,13 @@ | |||
13 | #include <linux/export.h> | 13 | #include <linux/export.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/dm-io.h> | 15 | #include <linux/dm-io.h> |
16 | #include "dm-bufio.h" | ||
16 | 17 | ||
17 | #define DM_MSG_PREFIX "persistent snapshot" | 18 | #define DM_MSG_PREFIX "persistent snapshot" |
18 | #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ | 19 | #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ |
19 | 20 | ||
21 | #define DM_PREFETCH_CHUNKS 12 | ||
22 | |||
20 | /*----------------------------------------------------------------- | 23 | /*----------------------------------------------------------------- |
21 | * Persistent snapshots, by persistent we mean that the snapshot | 24 | * Persistent snapshots, by persistent we mean that the snapshot |
22 | * will survive a reboot. | 25 | * will survive a reboot. |
@@ -257,6 +260,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
257 | INIT_WORK_ONSTACK(&req.work, do_metadata); | 260 | INIT_WORK_ONSTACK(&req.work, do_metadata); |
258 | queue_work(ps->metadata_wq, &req.work); | 261 | queue_work(ps->metadata_wq, &req.work); |
259 | flush_workqueue(ps->metadata_wq); | 262 | flush_workqueue(ps->metadata_wq); |
263 | destroy_work_on_stack(&req.work); | ||
260 | 264 | ||
261 | return req.result; | 265 | return req.result; |
262 | } | 266 | } |
@@ -401,17 +405,18 @@ static int write_header(struct pstore *ps) | |||
401 | /* | 405 | /* |
402 | * Access functions for the disk exceptions, these do the endian conversions. | 406 | * Access functions for the disk exceptions, these do the endian conversions. |
403 | */ | 407 | */ |
404 | static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) | 408 | static struct disk_exception *get_exception(struct pstore *ps, void *ps_area, |
409 | uint32_t index) | ||
405 | { | 410 | { |
406 | BUG_ON(index >= ps->exceptions_per_area); | 411 | BUG_ON(index >= ps->exceptions_per_area); |
407 | 412 | ||
408 | return ((struct disk_exception *) ps->area) + index; | 413 | return ((struct disk_exception *) ps_area) + index; |
409 | } | 414 | } |
410 | 415 | ||
411 | static void read_exception(struct pstore *ps, | 416 | static void read_exception(struct pstore *ps, void *ps_area, |
412 | uint32_t index, struct core_exception *result) | 417 | uint32_t index, struct core_exception *result) |
413 | { | 418 | { |
414 | struct disk_exception *de = get_exception(ps, index); | 419 | struct disk_exception *de = get_exception(ps, ps_area, index); |
415 | 420 | ||
416 | /* copy it */ | 421 | /* copy it */ |
417 | result->old_chunk = le64_to_cpu(de->old_chunk); | 422 | result->old_chunk = le64_to_cpu(de->old_chunk); |
@@ -421,7 +426,7 @@ static void read_exception(struct pstore *ps, | |||
421 | static void write_exception(struct pstore *ps, | 426 | static void write_exception(struct pstore *ps, |
422 | uint32_t index, struct core_exception *e) | 427 | uint32_t index, struct core_exception *e) |
423 | { | 428 | { |
424 | struct disk_exception *de = get_exception(ps, index); | 429 | struct disk_exception *de = get_exception(ps, ps->area, index); |
425 | 430 | ||
426 | /* copy it */ | 431 | /* copy it */ |
427 | de->old_chunk = cpu_to_le64(e->old_chunk); | 432 | de->old_chunk = cpu_to_le64(e->old_chunk); |
@@ -430,7 +435,7 @@ static void write_exception(struct pstore *ps, | |||
430 | 435 | ||
431 | static void clear_exception(struct pstore *ps, uint32_t index) | 436 | static void clear_exception(struct pstore *ps, uint32_t index) |
432 | { | 437 | { |
433 | struct disk_exception *de = get_exception(ps, index); | 438 | struct disk_exception *de = get_exception(ps, ps->area, index); |
434 | 439 | ||
435 | /* clear it */ | 440 | /* clear it */ |
436 | de->old_chunk = 0; | 441 | de->old_chunk = 0; |
@@ -442,7 +447,7 @@ static void clear_exception(struct pstore *ps, uint32_t index) | |||
442 | * 'full' is filled in to indicate if the area has been | 447 | * 'full' is filled in to indicate if the area has been |
443 | * filled. | 448 | * filled. |
444 | */ | 449 | */ |
445 | static int insert_exceptions(struct pstore *ps, | 450 | static int insert_exceptions(struct pstore *ps, void *ps_area, |
446 | int (*callback)(void *callback_context, | 451 | int (*callback)(void *callback_context, |
447 | chunk_t old, chunk_t new), | 452 | chunk_t old, chunk_t new), |
448 | void *callback_context, | 453 | void *callback_context, |
@@ -456,7 +461,7 @@ static int insert_exceptions(struct pstore *ps, | |||
456 | *full = 1; | 461 | *full = 1; |
457 | 462 | ||
458 | for (i = 0; i < ps->exceptions_per_area; i++) { | 463 | for (i = 0; i < ps->exceptions_per_area; i++) { |
459 | read_exception(ps, i, &e); | 464 | read_exception(ps, ps_area, i, &e); |
460 | 465 | ||
461 | /* | 466 | /* |
462 | * If the new_chunk is pointing at the start of | 467 | * If the new_chunk is pointing at the start of |
@@ -493,26 +498,75 @@ static int read_exceptions(struct pstore *ps, | |||
493 | void *callback_context) | 498 | void *callback_context) |
494 | { | 499 | { |
495 | int r, full = 1; | 500 | int r, full = 1; |
501 | struct dm_bufio_client *client; | ||
502 | chunk_t prefetch_area = 0; | ||
503 | |||
504 | client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, | ||
505 | ps->store->chunk_size << SECTOR_SHIFT, | ||
506 | 1, 0, NULL, NULL); | ||
507 | |||
508 | if (IS_ERR(client)) | ||
509 | return PTR_ERR(client); | ||
510 | |||
511 | /* | ||
512 | * Setup for one current buffer + desired readahead buffers. | ||
513 | */ | ||
514 | dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS); | ||
496 | 515 | ||
497 | /* | 516 | /* |
498 | * Keeping reading chunks and inserting exceptions until | 517 | * Keeping reading chunks and inserting exceptions until |
499 | * we find a partially full area. | 518 | * we find a partially full area. |
500 | */ | 519 | */ |
501 | for (ps->current_area = 0; full; ps->current_area++) { | 520 | for (ps->current_area = 0; full; ps->current_area++) { |
502 | r = area_io(ps, READ); | 521 | struct dm_buffer *bp; |
503 | if (r) | 522 | void *area; |
504 | return r; | 523 | chunk_t chunk; |
524 | |||
525 | if (unlikely(prefetch_area < ps->current_area)) | ||
526 | prefetch_area = ps->current_area; | ||
527 | |||
528 | if (DM_PREFETCH_CHUNKS) do { | ||
529 | chunk_t pf_chunk = area_location(ps, prefetch_area); | ||
530 | if (unlikely(pf_chunk >= dm_bufio_get_device_size(client))) | ||
531 | break; | ||
532 | dm_bufio_prefetch(client, pf_chunk, 1); | ||
533 | prefetch_area++; | ||
534 | if (unlikely(!prefetch_area)) | ||
535 | break; | ||
536 | } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS); | ||
537 | |||
538 | chunk = area_location(ps, ps->current_area); | ||
539 | |||
540 | area = dm_bufio_read(client, chunk, &bp); | ||
541 | if (unlikely(IS_ERR(area))) { | ||
542 | r = PTR_ERR(area); | ||
543 | goto ret_destroy_bufio; | ||
544 | } | ||
505 | 545 | ||
506 | r = insert_exceptions(ps, callback, callback_context, &full); | 546 | r = insert_exceptions(ps, area, callback, callback_context, |
507 | if (r) | 547 | &full); |
508 | return r; | 548 | |
549 | if (!full) | ||
550 | memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT); | ||
551 | |||
552 | dm_bufio_release(bp); | ||
553 | |||
554 | dm_bufio_forget(client, chunk); | ||
555 | |||
556 | if (unlikely(r)) | ||
557 | goto ret_destroy_bufio; | ||
509 | } | 558 | } |
510 | 559 | ||
511 | ps->current_area--; | 560 | ps->current_area--; |
512 | 561 | ||
513 | skip_metadata(ps); | 562 | skip_metadata(ps); |
514 | 563 | ||
515 | return 0; | 564 | r = 0; |
565 | |||
566 | ret_destroy_bufio: | ||
567 | dm_bufio_client_destroy(client); | ||
568 | |||
569 | return r; | ||
516 | } | 570 | } |
517 | 571 | ||
518 | static struct pstore *get_info(struct dm_exception_store *store) | 572 | static struct pstore *get_info(struct dm_exception_store *store) |
@@ -733,7 +787,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, | |||
733 | ps->current_committed = ps->exceptions_per_area; | 787 | ps->current_committed = ps->exceptions_per_area; |
734 | } | 788 | } |
735 | 789 | ||
736 | read_exception(ps, ps->current_committed - 1, &ce); | 790 | read_exception(ps, ps->area, ps->current_committed - 1, &ce); |
737 | *last_old_chunk = ce.old_chunk; | 791 | *last_old_chunk = ce.old_chunk; |
738 | *last_new_chunk = ce.new_chunk; | 792 | *last_new_chunk = ce.new_chunk; |
739 | 793 | ||
@@ -743,8 +797,8 @@ static int persistent_prepare_merge(struct dm_exception_store *store, | |||
743 | */ | 797 | */ |
744 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; | 798 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; |
745 | nr_consecutive++) { | 799 | nr_consecutive++) { |
746 | read_exception(ps, ps->current_committed - 1 - nr_consecutive, | 800 | read_exception(ps, ps->area, |
747 | &ce); | 801 | ps->current_committed - 1 - nr_consecutive, &ce); |
748 | if (ce.old_chunk != *last_old_chunk - nr_consecutive || | 802 | if (ce.old_chunk != *last_old_chunk - nr_consecutive || |
749 | ce.new_chunk != *last_new_chunk - nr_consecutive) | 803 | ce.new_chunk != *last_new_chunk - nr_consecutive) |
750 | break; | 804 | break; |
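The read_exceptions() rework above replaces the synchronous area_io(READ) loop with dm-bufio reads plus a read-ahead window of DM_PREFETCH_CHUNKS exception areas, guarded against running past the end of the COW device and against chunk-number wrap-around. The following is a small, userspace-only sketch of that sliding prefetch-window arithmetic; prefetch(), device_size and the simplified area_location() are stand-ins for the dm-bufio calls and the on-disk layout, not the kernel API.

#include <stdio.h>
#include <stdint.h>

#define PREFETCH_CHUNKS 12		/* analogue of DM_PREFETCH_CHUNKS */

static uint64_t device_size = 40;	/* size of the COW device, in chunks */

/* Stand-in for dm_bufio_prefetch(). */
static void prefetch(uint64_t chunk)
{
	printf("prefetch chunk %llu\n", (unsigned long long)chunk);
}

/* Identity-style mapping from area index to on-disk chunk for this
 * sketch; the real area_location() computes the actual layout. */
static uint64_t area_location(uint64_t area)
{
	return 1 + area;
}

int main(void)
{
	uint64_t current_area, prefetch_area = 0;
	int full = 1;

	for (current_area = 0; full; current_area++) {
		/* Never let the read-ahead window fall behind the reader. */
		if (prefetch_area < current_area)
			prefetch_area = current_area;

		/* Issue read-ahead up to PREFETCH_CHUNKS areas ahead,
		 * stopping at the end of the device or on wrap-around. */
		do {
			uint64_t chunk = area_location(prefetch_area);

			if (chunk >= device_size)
				break;
			prefetch(chunk);
			prefetch_area++;
			if (!prefetch_area)	/* wrapped to zero */
				break;
		} while (prefetch_area <= current_area + PREFETCH_CHUNKS);

		printf("read area %llu (chunk %llu)\n",
		       (unsigned long long)current_area,
		       (unsigned long long)area_location(current_area));

		if (current_area == 5)	/* pretend area 5 is only partly full */
			full = 0;
	}
	return 0;
}

On the first pass the whole window is primed; on every later iteration only one new prefetch is issued, keeping roughly PREFETCH_CHUNKS areas in flight ahead of the reader.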
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 944690bafd93..ebddef5237e4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -610,12 +610,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, | |||
610 | return NULL; | 610 | return NULL; |
611 | } | 611 | } |
612 | 612 | ||
613 | static struct dm_exception *alloc_completed_exception(void) | 613 | static struct dm_exception *alloc_completed_exception(gfp_t gfp) |
614 | { | 614 | { |
615 | struct dm_exception *e; | 615 | struct dm_exception *e; |
616 | 616 | ||
617 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); | 617 | e = kmem_cache_alloc(exception_cache, gfp); |
618 | if (!e) | 618 | if (!e && gfp == GFP_NOIO) |
619 | e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); | 619 | e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); |
620 | 620 | ||
621 | return e; | 621 | return e; |
@@ -697,7 +697,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
697 | struct dm_snapshot *s = context; | 697 | struct dm_snapshot *s = context; |
698 | struct dm_exception *e; | 698 | struct dm_exception *e; |
699 | 699 | ||
700 | e = alloc_completed_exception(); | 700 | e = alloc_completed_exception(GFP_KERNEL); |
701 | if (!e) | 701 | if (!e) |
702 | return -ENOMEM; | 702 | return -ENOMEM; |
703 | 703 | ||
@@ -1405,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
1405 | goto out; | 1405 | goto out; |
1406 | } | 1406 | } |
1407 | 1407 | ||
1408 | e = alloc_completed_exception(); | 1408 | e = alloc_completed_exception(GFP_NOIO); |
1409 | if (!e) { | 1409 | if (!e) { |
1410 | down_write(&s->lock); | 1410 | down_write(&s->lock); |
1411 | __invalidate_snapshot(s, -ENOMEM); | 1411 | __invalidate_snapshot(s, -ENOMEM); |
@@ -1438,6 +1438,7 @@ out: | |||
1438 | if (full_bio) { | 1438 | if (full_bio) { |
1439 | full_bio->bi_end_io = pe->full_bio_end_io; | 1439 | full_bio->bi_end_io = pe->full_bio_end_io; |
1440 | full_bio->bi_private = pe->full_bio_private; | 1440 | full_bio->bi_private = pe->full_bio_private; |
1441 | atomic_inc(&full_bio->bi_remaining); | ||
1441 | } | 1442 | } |
1442 | free_pending_exception(pe); | 1443 | free_pending_exception(pe); |
1443 | 1444 | ||
@@ -1619,11 +1620,10 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, | |||
1619 | struct bio *bio, chunk_t chunk) | 1620 | struct bio *bio, chunk_t chunk) |
1620 | { | 1621 | { |
1621 | bio->bi_bdev = s->cow->bdev; | 1622 | bio->bi_bdev = s->cow->bdev; |
1622 | bio->bi_sector = chunk_to_sector(s->store, | 1623 | bio->bi_iter.bi_sector = |
1623 | dm_chunk_number(e->new_chunk) + | 1624 | chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + |
1624 | (chunk - e->old_chunk)) + | 1625 | (chunk - e->old_chunk)) + |
1625 | (bio->bi_sector & | 1626 | (bio->bi_iter.bi_sector & s->store->chunk_mask); |
1626 | s->store->chunk_mask); | ||
1627 | } | 1627 | } |
1628 | 1628 | ||
1629 | static int snapshot_map(struct dm_target *ti, struct bio *bio) | 1629 | static int snapshot_map(struct dm_target *ti, struct bio *bio) |
@@ -1641,7 +1641,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) | |||
1641 | return DM_MAPIO_REMAPPED; | 1641 | return DM_MAPIO_REMAPPED; |
1642 | } | 1642 | } |
1643 | 1643 | ||
1644 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1644 | chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); |
1645 | 1645 | ||
1646 | /* Full snapshots are not usable */ | 1646 | /* Full snapshots are not usable */ |
1647 | /* To get here the table must be live so s->active is always set. */ | 1647 | /* To get here the table must be live so s->active is always set. */ |
@@ -1702,7 +1702,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) | |||
1702 | r = DM_MAPIO_SUBMITTED; | 1702 | r = DM_MAPIO_SUBMITTED; |
1703 | 1703 | ||
1704 | if (!pe->started && | 1704 | if (!pe->started && |
1705 | bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { | 1705 | bio->bi_iter.bi_size == |
1706 | (s->store->chunk_size << SECTOR_SHIFT)) { | ||
1706 | pe->started = 1; | 1707 | pe->started = 1; |
1707 | up_write(&s->lock); | 1708 | up_write(&s->lock); |
1708 | start_full_bio(pe, bio); | 1709 | start_full_bio(pe, bio); |
@@ -1758,7 +1759,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) | |||
1758 | return DM_MAPIO_REMAPPED; | 1759 | return DM_MAPIO_REMAPPED; |
1759 | } | 1760 | } |
1760 | 1761 | ||
1761 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1762 | chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); |
1762 | 1763 | ||
1763 | down_write(&s->lock); | 1764 | down_write(&s->lock); |
1764 | 1765 | ||
@@ -2095,7 +2096,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) | |||
2095 | down_read(&_origins_lock); | 2096 | down_read(&_origins_lock); |
2096 | o = __lookup_origin(origin->bdev); | 2097 | o = __lookup_origin(origin->bdev); |
2097 | if (o) | 2098 | if (o) |
2098 | r = __origin_write(&o->snapshots, bio->bi_sector, bio); | 2099 | r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); |
2099 | up_read(&_origins_lock); | 2100 | up_read(&_origins_lock); |
2100 | 2101 | ||
2101 | return r; | 2102 | return r; |
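In the dm-snap.c hunks above, alloc_completed_exception() now takes its gfp flags from the caller: the exception-table load path asks for GFP_KERNEL and simply fails on -ENOMEM, while only the I/O completion path keeps the GFP_NOIO allocation with the GFP_ATOMIC fallback. A minimal userspace sketch of that "fall back to the emergency attempt only when the caller cannot wait" shape, with malloc() standing in for kmem_cache_alloc():

#include <stdlib.h>

/* Analogue of calling with GFP_KERNEL (table load, may sleep and may
 * fail cleanly) versus GFP_NOIO (I/O completion path). */
enum alloc_ctx { CTX_TABLE_LOAD, CTX_IO_PATH };

/* Stand-in for the GFP_ATOMIC emergency attempt. */
static void *emergency_alloc(size_t n)
{
	return malloc(n);
}

static void *alloc_exception(size_t n, enum alloc_ctx ctx)
{
	void *e = malloc(n);

	/* Only the I/O path gets the second, emergency attempt; the
	 * table-load path just reports the failure to its caller. */
	if (!e && ctx == CTX_IO_PATH)
		e = emergency_alloc(n);

	return e;
}

int main(void)
{
	void *e = alloc_exception(64, CTX_TABLE_LOAD);

	free(e);
	return 0;
}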
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 73c1712dad96..d1600d2aa2e2 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -259,13 +259,15 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, | |||
259 | { | 259 | { |
260 | sector_t begin, end; | 260 | sector_t begin, end; |
261 | 261 | ||
262 | stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); | 262 | stripe_map_range_sector(sc, bio->bi_iter.bi_sector, |
263 | target_stripe, &begin); | ||
263 | stripe_map_range_sector(sc, bio_end_sector(bio), | 264 | stripe_map_range_sector(sc, bio_end_sector(bio), |
264 | target_stripe, &end); | 265 | target_stripe, &end); |
265 | if (begin < end) { | 266 | if (begin < end) { |
266 | bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; | 267 | bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; |
267 | bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; | 268 | bio->bi_iter.bi_sector = begin + |
268 | bio->bi_size = to_bytes(end - begin); | 269 | sc->stripe[target_stripe].physical_start; |
270 | bio->bi_iter.bi_size = to_bytes(end - begin); | ||
269 | return DM_MAPIO_REMAPPED; | 271 | return DM_MAPIO_REMAPPED; |
270 | } else { | 272 | } else { |
271 | /* The range doesn't map to the target stripe */ | 273 | /* The range doesn't map to the target stripe */ |
@@ -293,9 +295,10 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) | |||
293 | return stripe_map_range(sc, bio, target_bio_nr); | 295 | return stripe_map_range(sc, bio, target_bio_nr); |
294 | } | 296 | } |
295 | 297 | ||
296 | stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); | 298 | stripe_map_sector(sc, bio->bi_iter.bi_sector, |
299 | &stripe, &bio->bi_iter.bi_sector); | ||
297 | 300 | ||
298 | bio->bi_sector += sc->stripe[stripe].physical_start; | 301 | bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; |
299 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | 302 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; |
300 | 303 | ||
301 | return DM_MAPIO_REMAPPED; | 304 | return DM_MAPIO_REMAPPED; |
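The dm-stripe hunks only move the sector and size into bio->bi_iter; the remapping itself is unchanged: the incoming sector is split into a stripe index and an offset on that stripe, and the offset is then rebased onto the chosen stripe's physical_start. Below is a worked userspace example of the usual chunked round-robin split; it assumes a power-of-two chunk size for brevity, whereas the real target also handles the general case via sector_div().

#include <stdio.h>
#include <stdint.h>

/* Split a logical sector into (stripe, sector on that stripe) for a
 * round-robin stripe set with a power-of-two chunk size. */
static void map_sector(uint64_t sector, unsigned nr_stripes,
		       unsigned chunk_sectors,	/* power of two */
		       unsigned *stripe, uint64_t *result)
{
	uint64_t chunk = sector / chunk_sectors;
	unsigned offset_in_chunk = sector & (chunk_sectors - 1);

	*stripe = chunk % nr_stripes;
	*result = (chunk / nr_stripes) * chunk_sectors + offset_in_chunk;
}

int main(void)
{
	unsigned stripe;
	uint64_t mapped;

	/* Sector 300, 3 stripes, 128-sector chunks:
	 * chunk 2 -> stripe 2, sector 44 on that stripe. */
	map_sector(300, 3, 128, &stripe, &mapped);
	printf("stripe %u, sector %llu\n", stripe, (unsigned long long)mapped);
	return 0;
}

stripe_map() then adds sc->stripe[stripe].physical_start to that result and points the bio at the stripe's block device, exactly as in the hunk above.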
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index ff9ac4be4721..09a688b3d48c 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c | |||
@@ -311,11 +311,11 @@ error: | |||
311 | static int switch_map(struct dm_target *ti, struct bio *bio) | 311 | static int switch_map(struct dm_target *ti, struct bio *bio) |
312 | { | 312 | { |
313 | struct switch_ctx *sctx = ti->private; | 313 | struct switch_ctx *sctx = ti->private; |
314 | sector_t offset = dm_target_offset(ti, bio->bi_sector); | 314 | sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); |
315 | unsigned path_nr = switch_get_path_nr(sctx, offset); | 315 | unsigned path_nr = switch_get_path_nr(sctx, offset); |
316 | 316 | ||
317 | bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; | 317 | bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; |
318 | bio->bi_sector = sctx->path_list[path_nr].start + offset; | 318 | bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; |
319 | 319 | ||
320 | return DM_MAPIO_REMAPPED; | 320 | return DM_MAPIO_REMAPPED; |
321 | } | 321 | } |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 84d2b91e4efb..c62c5ab6aed5 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
@@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = { | |||
86 | static struct kobj_type dm_ktype = { | 86 | static struct kobj_type dm_ktype = { |
87 | .sysfs_ops = &dm_sysfs_ops, | 87 | .sysfs_ops = &dm_sysfs_ops, |
88 | .default_attrs = dm_attrs, | 88 | .default_attrs = dm_attrs, |
89 | .release = dm_kobject_release, | ||
89 | }; | 90 | }; |
90 | 91 | ||
91 | /* | 92 | /* |
@@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md) | |||
104 | */ | 105 | */ |
105 | void dm_sysfs_exit(struct mapped_device *md) | 106 | void dm_sysfs_exit(struct mapped_device *md) |
106 | { | 107 | { |
107 | kobject_put(dm_kobject(md)); | 108 | struct kobject *kobj = dm_kobject(md); |
109 | kobject_put(kobj); | ||
110 | wait_for_completion(dm_get_completion_from_kobject(kobj)); | ||
108 | } | 111 | } |
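dm_sysfs_exit() above now drops its kobject reference and then waits on a completion that the new dm_kobject_release() callback fires, so the object backing the kobject cannot be torn down while a sysfs reader still holds a reference. The sketch below models that handshake in userspace with pthreads; obj_put(), the refcount and the condition variable are illustrative stand-ins, not the kernel kobject or completion API.

#include <pthread.h>
#include <stdio.h>

/* Stand-alone completion, signalled by the last reference holder. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t released = PTHREAD_COND_INITIALIZER;
static int done;

static int refcount = 2;	/* the destroyer plus one sysfs reader */

static void obj_put(void)
{
	pthread_mutex_lock(&lock);
	if (--refcount == 0) {
		/* release callback: last reference gone, complete(). */
		done = 1;
		pthread_cond_signal(&released);
	}
	pthread_mutex_unlock(&lock);
}

static void *sysfs_reader(void *arg)
{
	(void)arg;
	/* ... pretend to read attributes ... */
	obj_put();			/* reader drops its reference */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sysfs_reader, NULL);

	obj_put();			/* dm_sysfs_exit(): drop our ref ... */

	pthread_mutex_lock(&lock);	/* ... then wait_for_completion() */
	while (!done)
		pthread_cond_wait(&released, &lock);
	pthread_mutex_unlock(&lock);

	puts("all references gone; the object may now be freed");

	pthread_join(t, NULL);
	return 0;
}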
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ba6a3859ce3..6a7f2b83a126 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -155,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num) | |||
155 | { | 155 | { |
156 | sector_t *n_highs; | 156 | sector_t *n_highs; |
157 | struct dm_target *n_targets; | 157 | struct dm_target *n_targets; |
158 | int n = t->num_targets; | ||
159 | 158 | ||
160 | /* | 159 | /* |
161 | * Allocate both the target array and offset array at once. | 160 | * Allocate both the target array and offset array at once. |
@@ -169,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num) | |||
169 | 168 | ||
170 | n_targets = (struct dm_target *) (n_highs + num); | 169 | n_targets = (struct dm_target *) (n_highs + num); |
171 | 170 | ||
172 | if (n) { | 171 | memset(n_highs, -1, sizeof(*n_highs) * num); |
173 | memcpy(n_highs, t->highs, sizeof(*n_highs) * n); | ||
174 | memcpy(n_targets, t->targets, sizeof(*n_targets) * n); | ||
175 | } | ||
176 | |||
177 | memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); | ||
178 | vfree(t->highs); | 172 | vfree(t->highs); |
179 | 173 | ||
180 | t->num_allocated = num; | 174 | t->num_allocated = num; |
@@ -261,17 +255,6 @@ void dm_table_destroy(struct dm_table *t) | |||
261 | } | 255 | } |
262 | 256 | ||
263 | /* | 257 | /* |
264 | * Checks to see if we need to extend highs or targets. | ||
265 | */ | ||
266 | static inline int check_space(struct dm_table *t) | ||
267 | { | ||
268 | if (t->num_targets >= t->num_allocated) | ||
269 | return alloc_targets(t, t->num_allocated * 2); | ||
270 | |||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * See if we've already got a device in the list. | 258 | * See if we've already got a device in the list. |
276 | */ | 259 | */ |
277 | static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) | 260 | static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) |
@@ -731,8 +714,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
731 | return -EINVAL; | 714 | return -EINVAL; |
732 | } | 715 | } |
733 | 716 | ||
734 | if ((r = check_space(t))) | 717 | BUG_ON(t->num_targets >= t->num_allocated); |
735 | return r; | ||
736 | 718 | ||
737 | tgt = t->targets + t->num_targets; | 719 | tgt = t->targets + t->num_targets; |
738 | memset(tgt, 0, sizeof(*tgt)); | 720 | memset(tgt, 0, sizeof(*tgt)); |
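With check_space() removed, dm_table_add_target() relies on the table having been sized once up front, and alloc_targets() no longer grows or copies an existing array: it lays the highs[] and targets[] arrays out back to back in a single allocation and marks every high sector as unset. A minimal userspace sketch of that co-allocation layout (calloc() standing in for the kernel's vmalloc, plain structs for the dm types):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef uint64_t sector_t;

struct target {			/* stand-in for struct dm_target */
	sector_t begin;
	sector_t len;
};

struct table {
	unsigned num_allocated;
	unsigned num_targets;
	sector_t *highs;	/* first half of the single allocation */
	struct target *targets;	/* second half, right after highs[num] */
};

static int alloc_targets(struct table *t, unsigned num)
{
	sector_t *n_highs;
	struct target *n_targets;

	/* One allocation holds both arrays. */
	n_highs = calloc(num, sizeof(*n_highs) + sizeof(*n_targets));
	if (!n_highs)
		return -1;

	/* The target array starts right after the last highs[] slot. */
	n_targets = (struct target *)(n_highs + num);

	/* Mark every high sector as "unset" (all bits one). */
	memset(n_highs, -1, sizeof(*n_highs) * num);

	t->num_allocated = num;
	t->num_targets = 0;
	t->highs = n_highs;
	t->targets = n_targets;
	return 0;
}

int main(void)
{
	struct table t;

	if (alloc_targets(&t, 16))
		return 1;
	free(t.highs);		/* a single free releases both arrays */
	return 0;
}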
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 8a30ad54bd46..fb9efc829182 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -76,7 +76,7 @@ | |||
76 | 76 | ||
77 | #define THIN_SUPERBLOCK_MAGIC 27022010 | 77 | #define THIN_SUPERBLOCK_MAGIC 27022010 |
78 | #define THIN_SUPERBLOCK_LOCATION 0 | 78 | #define THIN_SUPERBLOCK_LOCATION 0 |
79 | #define THIN_VERSION 1 | 79 | #define THIN_VERSION 2 |
80 | #define THIN_METADATA_CACHE_SIZE 64 | 80 | #define THIN_METADATA_CACHE_SIZE 64 |
81 | #define SECTOR_TO_BLOCK_SHIFT 3 | 81 | #define SECTOR_TO_BLOCK_SHIFT 3 |
82 | 82 | ||
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) | |||
483 | 483 | ||
484 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); | 484 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); |
485 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); | 485 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); |
486 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | 486 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE); |
487 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); | 487 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); |
488 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); | 488 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); |
489 | 489 | ||
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f | |||
651 | { | 651 | { |
652 | int r; | 652 | int r; |
653 | 653 | ||
654 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, | 654 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, |
655 | THIN_METADATA_CACHE_SIZE, | 655 | THIN_METADATA_CACHE_SIZE, |
656 | THIN_MAX_CONCURRENT_LOCKS); | 656 | THIN_MAX_CONCURRENT_LOCKS); |
657 | if (IS_ERR(pmd->bm)) { | 657 | if (IS_ERR(pmd->bm)) { |
@@ -1349,6 +1349,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) | |||
1349 | return td->id; | 1349 | return td->id; |
1350 | } | 1350 | } |
1351 | 1351 | ||
1352 | /* | ||
1353 | * Check whether @time (of block creation) is older than @td's last snapshot. | ||
1354 | * If so then the associated block is shared with the last snapshot device. | ||
1355 | * Any block on a device created *after* the device last got snapshotted is | ||
1356 | * necessarily not shared. | ||
1357 | */ | ||
1352 | static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) | 1358 | static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) |
1353 | { | 1359 | { |
1354 | return td->snapshotted_time > time; | 1360 | return td->snapshotted_time > time; |
@@ -1458,6 +1464,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) | |||
1458 | return r; | 1464 | return r; |
1459 | } | 1465 | } |
1460 | 1466 | ||
1467 | int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) | ||
1468 | { | ||
1469 | int r; | ||
1470 | uint32_t ref_count; | ||
1471 | |||
1472 | down_read(&pmd->root_lock); | ||
1473 | r = dm_sm_get_count(pmd->data_sm, b, &ref_count); | ||
1474 | if (!r) | ||
1475 | *result = (ref_count != 0); | ||
1476 | up_read(&pmd->root_lock); | ||
1477 | |||
1478 | return r; | ||
1479 | } | ||
1480 | |||
1461 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td) | 1481 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td) |
1462 | { | 1482 | { |
1463 | int r; | 1483 | int r; |
@@ -1469,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td) | |||
1469 | return r; | 1489 | return r; |
1470 | } | 1490 | } |
1471 | 1491 | ||
1492 | bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd) | ||
1493 | { | ||
1494 | bool r = false; | ||
1495 | struct dm_thin_device *td, *tmp; | ||
1496 | |||
1497 | down_read(&pmd->root_lock); | ||
1498 | list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { | ||
1499 | if (td->changed) { | ||
1500 | r = td->changed; | ||
1501 | break; | ||
1502 | } | ||
1503 | } | ||
1504 | up_read(&pmd->root_lock); | ||
1505 | |||
1506 | return r; | ||
1507 | } | ||
1508 | |||
1472 | bool dm_thin_aborted_changes(struct dm_thin_device *td) | 1509 | bool dm_thin_aborted_changes(struct dm_thin_device *td) |
1473 | { | 1510 | { |
1474 | bool r; | 1511 | bool r; |
@@ -1718,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | |||
1718 | 1755 | ||
1719 | return r; | 1756 | return r; |
1720 | } | 1757 | } |
1758 | |||
1759 | int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) | ||
1760 | { | ||
1761 | int r; | ||
1762 | struct dm_block *sblock; | ||
1763 | struct thin_disk_superblock *disk_super; | ||
1764 | |||
1765 | down_write(&pmd->root_lock); | ||
1766 | pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; | ||
1767 | |||
1768 | r = superblock_lock(pmd, &sblock); | ||
1769 | if (r) { | ||
1770 | DMERR("couldn't read superblock"); | ||
1771 | goto out; | ||
1772 | } | ||
1773 | |||
1774 | disk_super = dm_block_data(sblock); | ||
1775 | disk_super->flags = cpu_to_le32(pmd->flags); | ||
1776 | |||
1777 | dm_bm_unlock(sblock); | ||
1778 | out: | ||
1779 | up_write(&pmd->root_lock); | ||
1780 | return r; | ||
1781 | } | ||
1782 | |||
1783 | bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) | ||
1784 | { | ||
1785 | bool needs_check; | ||
1786 | |||
1787 | down_read(&pmd->root_lock); | ||
1788 | needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG; | ||
1789 | up_read(&pmd->root_lock); | ||
1790 | |||
1791 | return needs_check; | ||
1792 | } | ||
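The new dm-thin-metadata helpers above share one shape: take pmd->root_lock for read, sample a single piece of state (a data-block reference count, each device's changed flag, or the needs-check bit) and drop the lock, while dm_pool_metadata_set_needs_check() takes the lock for write and also persists the flag into the superblock. A small userspace sketch of such a read-mostly flag behind a reader/writer lock, with pthread_rwlock standing in for the kernel rw_semaphore:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

#define NEEDS_CHECK_FLAG (1u << 0)	/* cf. THIN_METADATA_NEEDS_CHECK_FLAG */

struct pool_metadata {			/* illustrative, not struct dm_pool_metadata */
	pthread_rwlock_t root_lock;
	uint32_t flags;
};

/* Writer: set the flag under the write lock (the kernel code also
 * copies it into the on-disk superblock before unlocking). */
static void metadata_set_needs_check(struct pool_metadata *pmd)
{
	pthread_rwlock_wrlock(&pmd->root_lock);
	pmd->flags |= NEEDS_CHECK_FLAG;
	pthread_rwlock_unlock(&pmd->root_lock);
}

/* Reader: sample the flag under the read lock, so it is always seen
 * consistently with the rest of the locked metadata state. */
static bool metadata_needs_check(struct pool_metadata *pmd)
{
	bool needs_check;

	pthread_rwlock_rdlock(&pmd->root_lock);
	needs_check = pmd->flags & NEEDS_CHECK_FLAG;
	pthread_rwlock_unlock(&pmd->root_lock);

	return needs_check;
}

int main(void)
{
	struct pool_metadata pmd = {
		.root_lock = PTHREAD_RWLOCK_INITIALIZER,
		.flags = 0,
	};

	metadata_set_needs_check(&pmd);
	return metadata_needs_check(&pmd) ? 0 : 1;
}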
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 7bcc0e1d6238..e3c857db195a 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h | |||
@@ -9,16 +9,14 @@ | |||
9 | 9 | ||
10 | #include "persistent-data/dm-block-manager.h" | 10 | #include "persistent-data/dm-block-manager.h" |
11 | #include "persistent-data/dm-space-map.h" | 11 | #include "persistent-data/dm-space-map.h" |
12 | #include "persistent-data/dm-space-map-metadata.h" | ||
12 | 13 | ||
13 | #define THIN_METADATA_BLOCK_SIZE 4096 | 14 | #define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE |
14 | 15 | ||
15 | /* | 16 | /* |
16 | * The metadata device is currently limited in size. | 17 | * The metadata device is currently limited in size. |
17 | * | ||
18 | * We have one block of index, which can hold 255 index entries. Each | ||
19 | * index entry contains allocation info about 16k metadata blocks. | ||
20 | */ | 18 | */ |
21 | #define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | 19 | #define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS |
22 | 20 | ||
23 | /* | 21 | /* |
24 | * A metadata device larger than 16GB triggers a warning. | 22 | * A metadata device larger than 16GB triggers a warning. |
@@ -27,6 +25,11 @@ | |||
27 | 25 | ||
28 | /*----------------------------------------------------------------*/ | 26 | /*----------------------------------------------------------------*/ |
29 | 27 | ||
28 | /* | ||
29 | * Thin metadata superblock flags. | ||
30 | */ | ||
31 | #define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0) | ||
32 | |||
30 | struct dm_pool_metadata; | 33 | struct dm_pool_metadata; |
31 | struct dm_thin_device; | 34 | struct dm_thin_device; |
32 | 35 | ||
@@ -131,7 +134,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); | |||
131 | 134 | ||
132 | struct dm_thin_lookup_result { | 135 | struct dm_thin_lookup_result { |
133 | dm_block_t block; | 136 | dm_block_t block; |
134 | unsigned shared:1; | 137 | bool shared:1; |
135 | }; | 138 | }; |
136 | 139 | ||
137 | /* | 140 | /* |
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); | |||
161 | */ | 164 | */ |
162 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td); | 165 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td); |
163 | 166 | ||
167 | bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd); | ||
168 | |||
164 | bool dm_thin_aborted_changes(struct dm_thin_device *td); | 169 | bool dm_thin_aborted_changes(struct dm_thin_device *td); |
165 | 170 | ||
166 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, | 171 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, |
@@ -181,6 +186,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); | |||
181 | 186 | ||
182 | int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); | 187 | int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); |
183 | 188 | ||
189 | int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); | ||
190 | |||
184 | /* | 191 | /* |
185 | * Returns -ENOSPC if the new size is too small and already allocated | 192 | * Returns -ENOSPC if the new size is too small and already allocated |
186 | * blocks would be lost. | 193 | * blocks would be lost. |
@@ -200,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | |||
200 | dm_sm_threshold_fn fn, | 207 | dm_sm_threshold_fn fn, |
201 | void *context); | 208 | void *context); |
202 | 209 | ||
210 | /* | ||
211 | * Updates the superblock immediately. | ||
212 | */ | ||
213 | int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); | ||
214 | bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); | ||
215 | |||
203 | /*----------------------------------------------------------------*/ | 216 | /*----------------------------------------------------------------*/ |
204 | 217 | ||
205 | #endif | 218 | #endif |
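The header change redefines THIN_METADATA_BLOCK_SIZE in sectors (DM_SM_METADATA_BLOCK_SIZE) rather than bytes, which is why __write_initial_superblock() drops its ">> SECTOR_SHIFT" and __create_persistent_data_objects() gains a "<< SECTOR_SHIFT". A tiny arithmetic check of the unit change, assuming the usual 512-byte sector and the 4 KiB metadata block implied by the old byte-based define:

#include <stdio.h>

#define SECTOR_SHIFT 9			/* 512-byte sectors */
#define METADATA_BLOCK_SECTORS 8	/* what the define now counts, given 4 KiB blocks */

int main(void)
{
	unsigned block_bytes = METADATA_BLOCK_SECTORS << SECTOR_SHIFT;

	/* 8 sectors * 512 bytes = 4096 bytes: the on-disk block size is
	 * unchanged, only the unit of the constant moved to sectors. */
	printf("metadata block: %u sectors = %u bytes\n",
	       METADATA_BLOCK_SECTORS, block_bytes);
	return 0;
}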
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ee29037ffc2e..be70d38745f7 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
130 | struct dm_thin_new_mapping; | 130 | struct dm_thin_new_mapping; |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * The pool runs in 3 modes. Ordered in degraded order for comparisons. | 133 | * The pool runs in 4 modes. Ordered in degraded order for comparisons. |
134 | */ | 134 | */ |
135 | enum pool_mode { | 135 | enum pool_mode { |
136 | PM_WRITE, /* metadata may be changed */ | 136 | PM_WRITE, /* metadata may be changed */ |
137 | PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ | ||
137 | PM_READ_ONLY, /* metadata may not be changed */ | 138 | PM_READ_ONLY, /* metadata may not be changed */ |
138 | PM_FAIL, /* all I/O fails */ | 139 | PM_FAIL, /* all I/O fails */ |
139 | }; | 140 | }; |
@@ -144,6 +145,7 @@ struct pool_features { | |||
144 | bool zero_new_blocks:1; | 145 | bool zero_new_blocks:1; |
145 | bool discard_enabled:1; | 146 | bool discard_enabled:1; |
146 | bool discard_passdown:1; | 147 | bool discard_passdown:1; |
148 | bool error_if_no_space:1; | ||
147 | }; | 149 | }; |
148 | 150 | ||
149 | struct thin_c; | 151 | struct thin_c; |
@@ -163,8 +165,7 @@ struct pool { | |||
163 | int sectors_per_block_shift; | 165 | int sectors_per_block_shift; |
164 | 166 | ||
165 | struct pool_features pf; | 167 | struct pool_features pf; |
166 | unsigned low_water_triggered:1; /* A dm event has been sent */ | 168 | bool low_water_triggered:1; /* A dm event has been sent */ |
167 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ | ||
168 | 169 | ||
169 | struct dm_bio_prison *prison; | 170 | struct dm_bio_prison *prison; |
170 | struct dm_kcopyd_client *copier; | 171 | struct dm_kcopyd_client *copier; |
@@ -198,7 +199,7 @@ struct pool { | |||
198 | }; | 199 | }; |
199 | 200 | ||
200 | static enum pool_mode get_pool_mode(struct pool *pool); | 201 | static enum pool_mode get_pool_mode(struct pool *pool); |
201 | static void set_pool_mode(struct pool *pool, enum pool_mode mode); | 202 | static void metadata_operation_failed(struct pool *pool, const char *op, int r); |
202 | 203 | ||
203 | /* | 204 | /* |
204 | * Target context for a pool. | 205 | * Target context for a pool. |
@@ -225,6 +226,7 @@ struct thin_c { | |||
225 | 226 | ||
226 | struct pool *pool; | 227 | struct pool *pool; |
227 | struct dm_thin_device *td; | 228 | struct dm_thin_device *td; |
229 | bool requeue_mode:1; | ||
228 | }; | 230 | }; |
229 | 231 | ||
230 | /*----------------------------------------------------------------*/ | 232 | /*----------------------------------------------------------------*/ |
@@ -368,14 +370,18 @@ struct dm_thin_endio_hook { | |||
368 | struct dm_thin_new_mapping *overwrite_mapping; | 370 | struct dm_thin_new_mapping *overwrite_mapping; |
369 | }; | 371 | }; |
370 | 372 | ||
371 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 373 | static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
372 | { | 374 | { |
373 | struct bio *bio; | 375 | struct bio *bio; |
374 | struct bio_list bios; | 376 | struct bio_list bios; |
377 | unsigned long flags; | ||
375 | 378 | ||
376 | bio_list_init(&bios); | 379 | bio_list_init(&bios); |
380 | |||
381 | spin_lock_irqsave(&tc->pool->lock, flags); | ||
377 | bio_list_merge(&bios, master); | 382 | bio_list_merge(&bios, master); |
378 | bio_list_init(master); | 383 | bio_list_init(master); |
384 | spin_unlock_irqrestore(&tc->pool->lock, flags); | ||
379 | 385 | ||
380 | while ((bio = bio_list_pop(&bios))) { | 386 | while ((bio = bio_list_pop(&bios))) { |
381 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 387 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
@@ -390,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
390 | static void requeue_io(struct thin_c *tc) | 396 | static void requeue_io(struct thin_c *tc) |
391 | { | 397 | { |
392 | struct pool *pool = tc->pool; | 398 | struct pool *pool = tc->pool; |
399 | |||
400 | requeue_bio_list(tc, &pool->deferred_bios); | ||
401 | requeue_bio_list(tc, &pool->retry_on_resume_list); | ||
402 | } | ||
403 | |||
404 | static void error_retry_list(struct pool *pool) | ||
405 | { | ||
406 | struct bio *bio; | ||
393 | unsigned long flags; | 407 | unsigned long flags; |
408 | struct bio_list bios; | ||
409 | |||
410 | bio_list_init(&bios); | ||
394 | 411 | ||
395 | spin_lock_irqsave(&pool->lock, flags); | 412 | spin_lock_irqsave(&pool->lock, flags); |
396 | __requeue_bio_list(tc, &pool->deferred_bios); | 413 | bio_list_merge(&bios, &pool->retry_on_resume_list); |
397 | __requeue_bio_list(tc, &pool->retry_on_resume_list); | 414 | bio_list_init(&pool->retry_on_resume_list); |
398 | spin_unlock_irqrestore(&pool->lock, flags); | 415 | spin_unlock_irqrestore(&pool->lock, flags); |
416 | |||
417 | while ((bio = bio_list_pop(&bios))) | ||
418 | bio_io_error(bio); | ||
399 | } | 419 | } |
400 | 420 | ||
401 | /* | 421 | /* |
@@ -413,7 +433,7 @@ static bool block_size_is_power_of_two(struct pool *pool) | |||
413 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) | 433 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) |
414 | { | 434 | { |
415 | struct pool *pool = tc->pool; | 435 | struct pool *pool = tc->pool; |
416 | sector_t block_nr = bio->bi_sector; | 436 | sector_t block_nr = bio->bi_iter.bi_sector; |
417 | 437 | ||
418 | if (block_size_is_power_of_two(pool)) | 438 | if (block_size_is_power_of_two(pool)) |
419 | block_nr >>= pool->sectors_per_block_shift; | 439 | block_nr >>= pool->sectors_per_block_shift; |
@@ -426,14 +446,15 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) | |||
426 | static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) | 446 | static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) |
427 | { | 447 | { |
428 | struct pool *pool = tc->pool; | 448 | struct pool *pool = tc->pool; |
429 | sector_t bi_sector = bio->bi_sector; | 449 | sector_t bi_sector = bio->bi_iter.bi_sector; |
430 | 450 | ||
431 | bio->bi_bdev = tc->pool_dev->bdev; | 451 | bio->bi_bdev = tc->pool_dev->bdev; |
432 | if (block_size_is_power_of_two(pool)) | 452 | if (block_size_is_power_of_two(pool)) |
433 | bio->bi_sector = (block << pool->sectors_per_block_shift) | | 453 | bio->bi_iter.bi_sector = |
434 | (bi_sector & (pool->sectors_per_block - 1)); | 454 | (block << pool->sectors_per_block_shift) | |
455 | (bi_sector & (pool->sectors_per_block - 1)); | ||
435 | else | 456 | else |
436 | bio->bi_sector = (block * pool->sectors_per_block) + | 457 | bio->bi_iter.bi_sector = (block * pool->sectors_per_block) + |
437 | sector_div(bi_sector, pool->sectors_per_block); | 458 | sector_div(bi_sector, pool->sectors_per_block); |
438 | } | 459 | } |
439 | 460 | ||
@@ -509,15 +530,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, | |||
509 | struct dm_thin_new_mapping { | 530 | struct dm_thin_new_mapping { |
510 | struct list_head list; | 531 | struct list_head list; |
511 | 532 | ||
512 | unsigned quiesced:1; | 533 | bool quiesced:1; |
513 | unsigned prepared:1; | 534 | bool prepared:1; |
514 | unsigned pass_discard:1; | 535 | bool pass_discard:1; |
536 | bool definitely_not_shared:1; | ||
515 | 537 | ||
538 | int err; | ||
516 | struct thin_c *tc; | 539 | struct thin_c *tc; |
517 | dm_block_t virt_block; | 540 | dm_block_t virt_block; |
518 | dm_block_t data_block; | 541 | dm_block_t data_block; |
519 | struct dm_bio_prison_cell *cell, *cell2; | 542 | struct dm_bio_prison_cell *cell, *cell2; |
520 | int err; | ||
521 | 543 | ||
522 | /* | 544 | /* |
523 | * If the bio covers the whole area of a block then we can avoid | 545 | * If the bio covers the whole area of a block then we can avoid |
@@ -534,7 +556,7 @@ static void __maybe_add_mapping(struct dm_thin_new_mapping *m) | |||
534 | struct pool *pool = m->tc->pool; | 556 | struct pool *pool = m->tc->pool; |
535 | 557 | ||
536 | if (m->quiesced && m->prepared) { | 558 | if (m->quiesced && m->prepared) { |
537 | list_add(&m->list, &pool->prepared_mappings); | 559 | list_add_tail(&m->list, &pool->prepared_mappings); |
538 | wake_worker(pool); | 560 | wake_worker(pool); |
539 | } | 561 | } |
540 | } | 562 | } |
@@ -548,7 +570,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) | |||
548 | m->err = read_err || write_err ? -EIO : 0; | 570 | m->err = read_err || write_err ? -EIO : 0; |
549 | 571 | ||
550 | spin_lock_irqsave(&pool->lock, flags); | 572 | spin_lock_irqsave(&pool->lock, flags); |
551 | m->prepared = 1; | 573 | m->prepared = true; |
552 | __maybe_add_mapping(m); | 574 | __maybe_add_mapping(m); |
553 | spin_unlock_irqrestore(&pool->lock, flags); | 575 | spin_unlock_irqrestore(&pool->lock, flags); |
554 | } | 576 | } |
@@ -563,7 +585,7 @@ static void overwrite_endio(struct bio *bio, int err) | |||
563 | m->err = err; | 585 | m->err = err; |
564 | 586 | ||
565 | spin_lock_irqsave(&pool->lock, flags); | 587 | spin_lock_irqsave(&pool->lock, flags); |
566 | m->prepared = 1; | 588 | m->prepared = true; |
567 | __maybe_add_mapping(m); | 589 | __maybe_add_mapping(m); |
568 | spin_unlock_irqrestore(&pool->lock, flags); | 590 | spin_unlock_irqrestore(&pool->lock, flags); |
569 | } | 591 | } |
@@ -610,8 +632,10 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c | |||
610 | 632 | ||
611 | static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) | 633 | static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) |
612 | { | 634 | { |
613 | if (m->bio) | 635 | if (m->bio) { |
614 | m->bio->bi_end_io = m->saved_bi_end_io; | 636 | m->bio->bi_end_io = m->saved_bi_end_io; |
637 | atomic_inc(&m->bio->bi_remaining); | ||
638 | } | ||
615 | cell_error(m->tc->pool, m->cell); | 639 | cell_error(m->tc->pool, m->cell); |
616 | list_del(&m->list); | 640 | list_del(&m->list); |
617 | mempool_free(m, m->tc->pool->mapping_pool); | 641 | mempool_free(m, m->tc->pool->mapping_pool); |
@@ -625,8 +649,10 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
625 | int r; | 649 | int r; |
626 | 650 | ||
627 | bio = m->bio; | 651 | bio = m->bio; |
628 | if (bio) | 652 | if (bio) { |
629 | bio->bi_end_io = m->saved_bi_end_io; | 653 | bio->bi_end_io = m->saved_bi_end_io; |
654 | atomic_inc(&bio->bi_remaining); | ||
655 | } | ||
630 | 656 | ||
631 | if (m->err) { | 657 | if (m->err) { |
632 | cell_error(pool, m->cell); | 658 | cell_error(pool, m->cell); |
@@ -640,9 +666,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
640 | */ | 666 | */ |
641 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); | 667 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); |
642 | if (r) { | 668 | if (r) { |
643 | DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", | 669 | metadata_operation_failed(pool, "dm_thin_insert_block", r); |
644 | dm_device_name(pool->pool_md), r); | ||
645 | set_pool_mode(pool, PM_READ_ONLY); | ||
646 | cell_error(pool, m->cell); | 670 | cell_error(pool, m->cell); |
647 | goto out; | 671 | goto out; |
648 | } | 672 | } |
@@ -683,7 +707,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) | |||
683 | cell_defer_no_holder(tc, m->cell2); | 707 | cell_defer_no_holder(tc, m->cell2); |
684 | 708 | ||
685 | if (m->pass_discard) | 709 | if (m->pass_discard) |
686 | remap_and_issue(tc, m->bio, m->data_block); | 710 | if (m->definitely_not_shared) |
711 | remap_and_issue(tc, m->bio, m->data_block); | ||
712 | else { | ||
713 | bool used = false; | ||
714 | if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used) | ||
715 | bio_endio(m->bio, 0); | ||
716 | else | ||
717 | remap_and_issue(tc, m->bio, m->data_block); | ||
718 | } | ||
687 | else | 719 | else |
688 | bio_endio(m->bio, 0); | 720 | bio_endio(m->bio, 0); |
689 | 721 | ||
@@ -723,7 +755,8 @@ static void process_prepared(struct pool *pool, struct list_head *head, | |||
723 | */ | 755 | */ |
724 | static int io_overlaps_block(struct pool *pool, struct bio *bio) | 756 | static int io_overlaps_block(struct pool *pool, struct bio *bio) |
725 | { | 757 | { |
726 | return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); | 758 | return bio->bi_iter.bi_size == |
759 | (pool->sectors_per_block << SECTOR_SHIFT); | ||
727 | } | 760 | } |
728 | 761 | ||
729 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | 762 | static int io_overwrites_block(struct pool *pool, struct bio *bio) |
@@ -751,13 +784,17 @@ static int ensure_next_mapping(struct pool *pool) | |||
751 | 784 | ||
752 | static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) | 785 | static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) |
753 | { | 786 | { |
754 | struct dm_thin_new_mapping *r = pool->next_mapping; | 787 | struct dm_thin_new_mapping *m = pool->next_mapping; |
755 | 788 | ||
756 | BUG_ON(!pool->next_mapping); | 789 | BUG_ON(!pool->next_mapping); |
757 | 790 | ||
791 | memset(m, 0, sizeof(struct dm_thin_new_mapping)); | ||
792 | INIT_LIST_HEAD(&m->list); | ||
793 | m->bio = NULL; | ||
794 | |||
758 | pool->next_mapping = NULL; | 795 | pool->next_mapping = NULL; |
759 | 796 | ||
760 | return r; | 797 | return m; |
761 | } | 798 | } |
762 | 799 | ||
763 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | 800 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, |
@@ -769,18 +806,13 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
769 | struct pool *pool = tc->pool; | 806 | struct pool *pool = tc->pool; |
770 | struct dm_thin_new_mapping *m = get_next_mapping(pool); | 807 | struct dm_thin_new_mapping *m = get_next_mapping(pool); |
771 | 808 | ||
772 | INIT_LIST_HEAD(&m->list); | ||
773 | m->quiesced = 0; | ||
774 | m->prepared = 0; | ||
775 | m->tc = tc; | 809 | m->tc = tc; |
776 | m->virt_block = virt_block; | 810 | m->virt_block = virt_block; |
777 | m->data_block = data_dest; | 811 | m->data_block = data_dest; |
778 | m->cell = cell; | 812 | m->cell = cell; |
779 | m->err = 0; | ||
780 | m->bio = NULL; | ||
781 | 813 | ||
782 | if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) | 814 | if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) |
783 | m->quiesced = 1; | 815 | m->quiesced = true; |
784 | 816 | ||
785 | /* | 817 | /* |
786 | * IO to pool_dev remaps to the pool target's data_dev. | 818 | * IO to pool_dev remaps to the pool target's data_dev. |
@@ -840,15 +872,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
840 | struct pool *pool = tc->pool; | 872 | struct pool *pool = tc->pool; |
841 | struct dm_thin_new_mapping *m = get_next_mapping(pool); | 873 | struct dm_thin_new_mapping *m = get_next_mapping(pool); |
842 | 874 | ||
843 | INIT_LIST_HEAD(&m->list); | 875 | m->quiesced = true; |
844 | m->quiesced = 1; | 876 | m->prepared = false; |
845 | m->prepared = 0; | ||
846 | m->tc = tc; | 877 | m->tc = tc; |
847 | m->virt_block = virt_block; | 878 | m->virt_block = virt_block; |
848 | m->data_block = data_block; | 879 | m->data_block = data_block; |
849 | m->cell = cell; | 880 | m->cell = cell; |
850 | m->err = 0; | ||
851 | m->bio = NULL; | ||
852 | 881 | ||
853 | /* | 882 | /* |
854 | * If the whole block of data is being overwritten or we are not | 883 | * If the whole block of data is being overwritten or we are not |
@@ -895,41 +924,44 @@ static int commit(struct pool *pool) | |||
895 | return -EINVAL; | 924 | return -EINVAL; |
896 | 925 | ||
897 | r = dm_pool_commit_metadata(pool->pmd); | 926 | r = dm_pool_commit_metadata(pool->pmd); |
898 | if (r) { | 927 | if (r) |
899 | DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", | 928 | metadata_operation_failed(pool, "dm_pool_commit_metadata", r); |
900 | dm_device_name(pool->pool_md), r); | ||
901 | set_pool_mode(pool, PM_READ_ONLY); | ||
902 | } | ||
903 | 929 | ||
904 | return r; | 930 | return r; |
905 | } | 931 | } |
906 | 932 | ||
907 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | 933 | static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) |
908 | { | 934 | { |
909 | int r; | ||
910 | dm_block_t free_blocks; | ||
911 | unsigned long flags; | 935 | unsigned long flags; |
912 | struct pool *pool = tc->pool; | ||
913 | |||
914 | /* | ||
915 | * Once no_free_space is set we must not allow allocation to succeed. | ||
916 | * Otherwise it is difficult to explain, debug, test and support. | ||
917 | */ | ||
918 | if (pool->no_free_space) | ||
919 | return -ENOSPC; | ||
920 | |||
921 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | ||
922 | if (r) | ||
923 | return r; | ||
924 | 936 | ||
925 | if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { | 937 | if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { |
926 | DMWARN("%s: reached low water mark for data device: sending event.", | 938 | DMWARN("%s: reached low water mark for data device: sending event.", |
927 | dm_device_name(pool->pool_md)); | 939 | dm_device_name(pool->pool_md)); |
928 | spin_lock_irqsave(&pool->lock, flags); | 940 | spin_lock_irqsave(&pool->lock, flags); |
929 | pool->low_water_triggered = 1; | 941 | pool->low_water_triggered = true; |
930 | spin_unlock_irqrestore(&pool->lock, flags); | 942 | spin_unlock_irqrestore(&pool->lock, flags); |
931 | dm_table_event(pool->ti->table); | 943 | dm_table_event(pool->ti->table); |
932 | } | 944 | } |
945 | } | ||
946 | |||
947 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); | ||
948 | |||
949 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | ||
950 | { | ||
951 | int r; | ||
952 | dm_block_t free_blocks; | ||
953 | struct pool *pool = tc->pool; | ||
954 | |||
955 | if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) | ||
956 | return -EINVAL; | ||
957 | |||
958 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | ||
959 | if (r) { | ||
960 | metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); | ||
961 | return r; | ||
962 | } | ||
963 | |||
964 | check_low_water_mark(pool, free_blocks); | ||
933 | 965 | ||
934 | if (!free_blocks) { | 966 | if (!free_blocks) { |
935 | /* | 967 | /* |
@@ -941,35 +973,20 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
941 | return r; | 973 | return r; |
942 | 974 | ||
943 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | 975 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
944 | if (r) | 976 | if (r) { |
977 | metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); | ||
945 | return r; | 978 | return r; |
979 | } | ||
946 | 980 | ||
947 | /* | ||
948 | * If we still have no space we set a flag to avoid | ||
949 | * doing all this checking and return -ENOSPC. This | ||
950 | * flag serves as a latch that disallows allocations from | ||
951 | * this pool until the admin takes action (e.g. resize or | ||
952 | * table reload). | ||
953 | */ | ||
954 | if (!free_blocks) { | 981 | if (!free_blocks) { |
955 | DMWARN("%s: no free data space available.", | 982 | set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); |
956 | dm_device_name(pool->pool_md)); | ||
957 | spin_lock_irqsave(&pool->lock, flags); | ||
958 | pool->no_free_space = 1; | ||
959 | spin_unlock_irqrestore(&pool->lock, flags); | ||
960 | return -ENOSPC; | 983 | return -ENOSPC; |
961 | } | 984 | } |
962 | } | 985 | } |
963 | 986 | ||
964 | r = dm_pool_alloc_data_block(pool->pmd, result); | 987 | r = dm_pool_alloc_data_block(pool->pmd, result); |
965 | if (r) { | 988 | if (r) { |
966 | if (r == -ENOSPC && | 989 | metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); |
967 | !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && | ||
968 | !free_blocks) { | ||
969 | DMWARN("%s: no free metadata space available.", | ||
970 | dm_device_name(pool->pool_md)); | ||
971 | set_pool_mode(pool, PM_READ_ONLY); | ||
972 | } | ||
973 | return r; | 990 | return r; |
974 | } | 991 | } |
975 | 992 | ||
@@ -992,16 +1009,56 @@ static void retry_on_resume(struct bio *bio) | |||
992 | spin_unlock_irqrestore(&pool->lock, flags); | 1009 | spin_unlock_irqrestore(&pool->lock, flags); |
993 | } | 1010 | } |
994 | 1011 | ||
995 | static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) | 1012 | static bool should_error_unserviceable_bio(struct pool *pool) |
1013 | { | ||
1014 | enum pool_mode m = get_pool_mode(pool); | ||
1015 | |||
1016 | switch (m) { | ||
1017 | case PM_WRITE: | ||
1018 | /* Shouldn't get here */ | ||
1019 | DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); | ||
1020 | return true; | ||
1021 | |||
1022 | case PM_OUT_OF_DATA_SPACE: | ||
1023 | return pool->pf.error_if_no_space; | ||
1024 | |||
1025 | case PM_READ_ONLY: | ||
1026 | case PM_FAIL: | ||
1027 | return true; | ||
1028 | default: | ||
1029 | /* Shouldn't get here */ | ||
1030 | DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); | ||
1031 | return true; | ||
1032 | } | ||
1033 | } | ||
1034 | |||
1035 | static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) | ||
1036 | { | ||
1037 | if (should_error_unserviceable_bio(pool)) | ||
1038 | bio_io_error(bio); | ||
1039 | else | ||
1040 | retry_on_resume(bio); | ||
1041 | } | ||
1042 | |||
1043 | static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) | ||
996 | { | 1044 | { |
997 | struct bio *bio; | 1045 | struct bio *bio; |
998 | struct bio_list bios; | 1046 | struct bio_list bios; |
999 | 1047 | ||
1048 | if (should_error_unserviceable_bio(pool)) { | ||
1049 | cell_error(pool, cell); | ||
1050 | return; | ||
1051 | } | ||
1052 | |||
1000 | bio_list_init(&bios); | 1053 | bio_list_init(&bios); |
1001 | cell_release(pool, cell, &bios); | 1054 | cell_release(pool, cell, &bios); |
1002 | 1055 | ||
1003 | while ((bio = bio_list_pop(&bios))) | 1056 | if (should_error_unserviceable_bio(pool)) |
1004 | retry_on_resume(bio); | 1057 | while ((bio = bio_list_pop(&bios))) |
1058 | bio_io_error(bio); | ||
1059 | else | ||
1060 | while ((bio = bio_list_pop(&bios))) | ||
1061 | retry_on_resume(bio); | ||
1005 | } | 1062 | } |
1006 | 1063 | ||
1007 | static void process_discard(struct thin_c *tc, struct bio *bio) | 1064 | static void process_discard(struct thin_c *tc, struct bio *bio) |
@@ -1040,17 +1097,17 @@ static void process_discard(struct thin_c *tc, struct bio *bio) | |||
1040 | */ | 1097 | */ |
1041 | m = get_next_mapping(pool); | 1098 | m = get_next_mapping(pool); |
1042 | m->tc = tc; | 1099 | m->tc = tc; |
1043 | m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; | 1100 | m->pass_discard = pool->pf.discard_passdown; |
1101 | m->definitely_not_shared = !lookup_result.shared; | ||
1044 | m->virt_block = block; | 1102 | m->virt_block = block; |
1045 | m->data_block = lookup_result.block; | 1103 | m->data_block = lookup_result.block; |
1046 | m->cell = cell; | 1104 | m->cell = cell; |
1047 | m->cell2 = cell2; | 1105 | m->cell2 = cell2; |
1048 | m->err = 0; | ||
1049 | m->bio = bio; | 1106 | m->bio = bio; |
1050 | 1107 | ||
1051 | if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { | 1108 | if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { |
1052 | spin_lock_irqsave(&pool->lock, flags); | 1109 | spin_lock_irqsave(&pool->lock, flags); |
1053 | list_add(&m->list, &pool->prepared_discards); | 1110 | list_add_tail(&m->list, &pool->prepared_discards); |
1054 | spin_unlock_irqrestore(&pool->lock, flags); | 1111 | spin_unlock_irqrestore(&pool->lock, flags); |
1055 | wake_worker(pool); | 1112 | wake_worker(pool); |
1056 | } | 1113 | } |
@@ -1105,13 +1162,12 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | |||
1105 | break; | 1162 | break; |
1106 | 1163 | ||
1107 | case -ENOSPC: | 1164 | case -ENOSPC: |
1108 | no_space(pool, cell); | 1165 | retry_bios_on_resume(pool, cell); |
1109 | break; | 1166 | break; |
1110 | 1167 | ||
1111 | default: | 1168 | default: |
1112 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", | 1169 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", |
1113 | __func__, r); | 1170 | __func__, r); |
1114 | set_pool_mode(pool, PM_READ_ONLY); | ||
1115 | cell_error(pool, cell); | 1171 | cell_error(pool, cell); |
1116 | break; | 1172 | break; |
1117 | } | 1173 | } |
@@ -1133,7 +1189,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, | |||
1133 | if (bio_detain(pool, &key, bio, &cell)) | 1189 | if (bio_detain(pool, &key, bio, &cell)) |
1134 | return; | 1190 | return; |
1135 | 1191 | ||
1136 | if (bio_data_dir(bio) == WRITE && bio->bi_size) | 1192 | if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) |
1137 | break_sharing(tc, bio, block, &key, lookup_result, cell); | 1193 | break_sharing(tc, bio, block, &key, lookup_result, cell); |
1138 | else { | 1194 | else { |
1139 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 1195 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
@@ -1156,7 +1212,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
1156 | /* | 1212 | /* |
1157 | * Remap empty bios (flushes) immediately, without provisioning. | 1213 | * Remap empty bios (flushes) immediately, without provisioning. |
1158 | */ | 1214 | */ |
1159 | if (!bio->bi_size) { | 1215 | if (!bio->bi_iter.bi_size) { |
1160 | inc_all_io_entry(pool, bio); | 1216 | inc_all_io_entry(pool, bio); |
1161 | cell_defer_no_holder(tc, cell); | 1217 | cell_defer_no_holder(tc, cell); |
1162 | 1218 | ||
@@ -1184,13 +1240,12 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
1184 | break; | 1240 | break; |
1185 | 1241 | ||
1186 | case -ENOSPC: | 1242 | case -ENOSPC: |
1187 | no_space(pool, cell); | 1243 | retry_bios_on_resume(pool, cell); |
1188 | break; | 1244 | break; |
1189 | 1245 | ||
1190 | default: | 1246 | default: |
1191 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", | 1247 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", |
1192 | __func__, r); | 1248 | __func__, r); |
1193 | set_pool_mode(pool, PM_READ_ONLY); | ||
1194 | cell_error(pool, cell); | 1249 | cell_error(pool, cell); |
1195 | break; | 1250 | break; |
1196 | } | 1251 | } |
@@ -1256,8 +1311,8 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) | |||
1256 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | 1311 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); |
1257 | switch (r) { | 1312 | switch (r) { |
1258 | case 0: | 1313 | case 0: |
1259 | if (lookup_result.shared && (rw == WRITE) && bio->bi_size) | 1314 | if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) |
1260 | bio_io_error(bio); | 1315 | handle_unserviceable_bio(tc->pool, bio); |
1261 | else { | 1316 | else { |
1262 | inc_all_io_entry(tc->pool, bio); | 1317 | inc_all_io_entry(tc->pool, bio); |
1263 | remap_and_issue(tc, bio, lookup_result.block); | 1318 | remap_and_issue(tc, bio, lookup_result.block); |
@@ -1266,7 +1321,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) | |||
1266 | 1321 | ||
1267 | case -ENODATA: | 1322 | case -ENODATA: |
1268 | if (rw != READ) { | 1323 | if (rw != READ) { |
1269 | bio_io_error(bio); | 1324 | handle_unserviceable_bio(tc->pool, bio); |
1270 | break; | 1325 | break; |
1271 | } | 1326 | } |
1272 | 1327 | ||
@@ -1288,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) | |||
1288 | } | 1343 | } |
1289 | } | 1344 | } |
1290 | 1345 | ||
1346 | static void process_bio_success(struct thin_c *tc, struct bio *bio) | ||
1347 | { | ||
1348 | bio_endio(bio, 0); | ||
1349 | } | ||
1350 | |||
1291 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) | 1351 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) |
1292 | { | 1352 | { |
1293 | bio_io_error(bio); | 1353 | bio_io_error(bio); |
@@ -1320,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool) | |||
1320 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 1380 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
1321 | struct thin_c *tc = h->tc; | 1381 | struct thin_c *tc = h->tc; |
1322 | 1382 | ||
1383 | if (tc->requeue_mode) { | ||
1384 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
1385 | continue; | ||
1386 | } | ||
1387 | |||
1323 | /* | 1388 | /* |
1324 | * If we've got no free new_mapping structs, and processing | 1389 | * If we've got no free new_mapping structs, and processing |
1325 | * this bio might require one, we pause until there are some | 1390 | * this bio might require one, we pause until there are some |
@@ -1349,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool) | |||
1349 | bio_list_init(&pool->deferred_flush_bios); | 1414 | bio_list_init(&pool->deferred_flush_bios); |
1350 | spin_unlock_irqrestore(&pool->lock, flags); | 1415 | spin_unlock_irqrestore(&pool->lock, flags); |
1351 | 1416 | ||
1352 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) | 1417 | if (bio_list_empty(&bios) && |
1418 | !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) | ||
1353 | return; | 1419 | return; |
1354 | 1420 | ||
1355 | if (commit(pool)) { | 1421 | if (commit(pool)) { |
@@ -1385,46 +1451,134 @@ static void do_waker(struct work_struct *ws) | |||
1385 | 1451 | ||
1386 | /*----------------------------------------------------------------*/ | 1452 | /*----------------------------------------------------------------*/ |
1387 | 1453 | ||
1454 | struct noflush_work { | ||
1455 | struct work_struct worker; | ||
1456 | struct thin_c *tc; | ||
1457 | |||
1458 | atomic_t complete; | ||
1459 | wait_queue_head_t wait; | ||
1460 | }; | ||
1461 | |||
1462 | static void complete_noflush_work(struct noflush_work *w) | ||
1463 | { | ||
1464 | atomic_set(&w->complete, 1); | ||
1465 | wake_up(&w->wait); | ||
1466 | } | ||
1467 | |||
1468 | static void do_noflush_start(struct work_struct *ws) | ||
1469 | { | ||
1470 | struct noflush_work *w = container_of(ws, struct noflush_work, worker); | ||
1471 | w->tc->requeue_mode = true; | ||
1472 | requeue_io(w->tc); | ||
1473 | complete_noflush_work(w); | ||
1474 | } | ||
1475 | |||
1476 | static void do_noflush_stop(struct work_struct *ws) | ||
1477 | { | ||
1478 | struct noflush_work *w = container_of(ws, struct noflush_work, worker); | ||
1479 | w->tc->requeue_mode = false; | ||
1480 | complete_noflush_work(w); | ||
1481 | } | ||
1482 | |||
1483 | static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) | ||
1484 | { | ||
1485 | struct noflush_work w; | ||
1486 | |||
1487 | INIT_WORK(&w.worker, fn); | ||
1488 | w.tc = tc; | ||
1489 | atomic_set(&w.complete, 0); | ||
1490 | init_waitqueue_head(&w.wait); | ||
1491 | |||
1492 | queue_work(tc->pool->wq, &w.worker); | ||
1493 | |||
1494 | wait_event(w.wait, atomic_read(&w.complete)); | ||
1495 | } | ||
1496 | |||
1497 | /*----------------------------------------------------------------*/ | ||
1498 | |||
1388 | static enum pool_mode get_pool_mode(struct pool *pool) | 1499 | static enum pool_mode get_pool_mode(struct pool *pool) |
1389 | { | 1500 | { |
1390 | return pool->pf.mode; | 1501 | return pool->pf.mode; |
1391 | } | 1502 | } |
1392 | 1503 | ||
1393 | static void set_pool_mode(struct pool *pool, enum pool_mode mode) | 1504 | static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) |
1394 | { | 1505 | { |
1395 | int r; | 1506 | dm_table_event(pool->ti->table); |
1507 | DMINFO("%s: switching pool to %s mode", | ||
1508 | dm_device_name(pool->pool_md), new_mode); | ||
1509 | } | ||
1396 | 1510 | ||
1397 | pool->pf.mode = mode; | 1511 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) |
1512 | { | ||
1513 | struct pool_c *pt = pool->ti->private; | ||
1514 | bool needs_check = dm_pool_metadata_needs_check(pool->pmd); | ||
1515 | enum pool_mode old_mode = get_pool_mode(pool); | ||
1398 | 1516 | ||
1399 | switch (mode) { | 1517 | /* |
1400 | case PM_FAIL: | 1518 | * Never allow the pool to transition to PM_WRITE mode if user |
1401 | DMERR("%s: switching pool to failure mode", | 1519 | * intervention is required to verify metadata and data consistency. |
1520 | */ | ||
1521 | if (new_mode == PM_WRITE && needs_check) { | ||
1522 | DMERR("%s: unable to switch pool to write mode until repaired.", | ||
1402 | dm_device_name(pool->pool_md)); | 1523 | dm_device_name(pool->pool_md)); |
1524 | if (old_mode != new_mode) | ||
1525 | new_mode = old_mode; | ||
1526 | else | ||
1527 | new_mode = PM_READ_ONLY; | ||
1528 | } | ||
1529 | /* | ||
1530 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | ||
1531 | * not going to recover without a thin_repair. So we never let the | ||
1532 | * pool move out of the old mode. | ||
1533 | */ | ||
1534 | if (old_mode == PM_FAIL) | ||
1535 | new_mode = old_mode; | ||
1536 | |||
1537 | switch (new_mode) { | ||
1538 | case PM_FAIL: | ||
1539 | if (old_mode != new_mode) | ||
1540 | notify_of_pool_mode_change(pool, "failure"); | ||
1403 | dm_pool_metadata_read_only(pool->pmd); | 1541 | dm_pool_metadata_read_only(pool->pmd); |
1404 | pool->process_bio = process_bio_fail; | 1542 | pool->process_bio = process_bio_fail; |
1405 | pool->process_discard = process_bio_fail; | 1543 | pool->process_discard = process_bio_fail; |
1406 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1544 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
1407 | pool->process_prepared_discard = process_prepared_discard_fail; | 1545 | pool->process_prepared_discard = process_prepared_discard_fail; |
1546 | |||
1547 | error_retry_list(pool); | ||
1408 | break; | 1548 | break; |
1409 | 1549 | ||
1410 | case PM_READ_ONLY: | 1550 | case PM_READ_ONLY: |
1411 | DMERR("%s: switching pool to read-only mode", | 1551 | if (old_mode != new_mode) |
1412 | dm_device_name(pool->pool_md)); | 1552 | notify_of_pool_mode_change(pool, "read-only"); |
1413 | r = dm_pool_abort_metadata(pool->pmd); | 1553 | dm_pool_metadata_read_only(pool->pmd); |
1414 | if (r) { | 1554 | pool->process_bio = process_bio_read_only; |
1415 | DMERR("%s: aborting transaction failed", | 1555 | pool->process_discard = process_bio_success; |
1416 | dm_device_name(pool->pool_md)); | 1556 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
1417 | set_pool_mode(pool, PM_FAIL); | 1557 | pool->process_prepared_discard = process_prepared_discard_passdown; |
1418 | } else { | 1558 | |
1419 | dm_pool_metadata_read_only(pool->pmd); | 1559 | error_retry_list(pool); |
1420 | pool->process_bio = process_bio_read_only; | 1560 | break; |
1421 | pool->process_discard = process_discard; | 1561 | |
1422 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1562 | case PM_OUT_OF_DATA_SPACE: |
1423 | pool->process_prepared_discard = process_prepared_discard_passdown; | 1563 | /* |
1424 | } | 1564 | * Ideally we'd never hit this state; the low water mark |
1565 | * would trigger userland to extend the pool before we | ||
1566 | * completely run out of data space. However, many small | ||
1567 | * IOs to unprovisioned space can consume data space at an | ||
1568 | * alarming rate. Adjust your low water mark if you're | ||
1569 | * frequently seeing this mode. | ||
1570 | */ | ||
1571 | if (old_mode != new_mode) | ||
1572 | notify_of_pool_mode_change(pool, "out-of-data-space"); | ||
1573 | pool->process_bio = process_bio_read_only; | ||
1574 | pool->process_discard = process_discard; | ||
1575 | pool->process_prepared_mapping = process_prepared_mapping; | ||
1576 | pool->process_prepared_discard = process_prepared_discard_passdown; | ||
1425 | break; | 1577 | break; |
1426 | 1578 | ||
1427 | case PM_WRITE: | 1579 | case PM_WRITE: |
1580 | if (old_mode != new_mode) | ||
1581 | notify_of_pool_mode_change(pool, "write"); | ||
1428 | dm_pool_metadata_read_write(pool->pmd); | 1582 | dm_pool_metadata_read_write(pool->pmd); |
1429 | pool->process_bio = process_bio; | 1583 | pool->process_bio = process_bio; |
1430 | pool->process_discard = process_discard; | 1584 | pool->process_discard = process_discard; |
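
The new noflush_work() helper above hands an on-stack work item to the pool's workqueue and blocks until the worker has flipped tc->requeue_mode and signalled completion. Below is a small pthread model of that "queue on-stack work, wait for completion" handoff; the thread, mutex and condition variable are userspace stand-ins, not the kernel workqueue or wait-queue API (build with cc -pthread).

/* Minimal userspace model of the on-stack work + wait-for-completion pattern
 * used by noflush_work(); names and the pthread plumbing are illustrative. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct noflush_work {
        void (*fn)(struct noflush_work *w);
        bool complete;
        pthread_mutex_t lock;
        pthread_cond_t wait;
        bool *requeue_mode;          /* state the worker flips on our behalf */
};

static void *worker_thread(void *arg)
{
        struct noflush_work *w = arg;  /* a real workqueue would dequeue this */

        w->fn(w);

        pthread_mutex_lock(&w->lock);
        w->complete = true;            /* complete_noflush_work() equivalent */
        pthread_cond_signal(&w->wait);
        pthread_mutex_unlock(&w->lock);
        return NULL;
}

static void do_noflush_start(struct noflush_work *w) { *w->requeue_mode = true; }
static void do_noflush_stop(struct noflush_work *w)  { *w->requeue_mode = false; }

/* Queue the work and sleep until the worker has run it. */
static void noflush_work(bool *requeue_mode, void (*fn)(struct noflush_work *))
{
        struct noflush_work w = {
                .fn = fn, .complete = false, .requeue_mode = requeue_mode,
                .lock = PTHREAD_MUTEX_INITIALIZER, .wait = PTHREAD_COND_INITIALIZER,
        };
        pthread_t tid;

        pthread_create(&tid, NULL, worker_thread, &w);
        pthread_mutex_lock(&w.lock);
        while (!w.complete)
                pthread_cond_wait(&w.wait, &w.lock);
        pthread_mutex_unlock(&w.lock);
        pthread_join(tid, NULL);
}

int main(void)
{
        bool requeue_mode = false;

        noflush_work(&requeue_mode, do_noflush_start);
        printf("after presuspend: requeue_mode=%d\n", requeue_mode);  /* 1 */
        noflush_work(&requeue_mode, do_noflush_stop);
        printf("after postsuspend: requeue_mode=%d\n", requeue_mode); /* 0 */
        return 0;
}

Running the flag changes on the pool's own worker keeps them ordered with respect to any bios already queued there, which is the point of the synchronous handoff.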
@@ -1432,6 +1586,38 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) | |||
1432 | pool->process_prepared_discard = process_prepared_discard; | 1586 | pool->process_prepared_discard = process_prepared_discard; |
1433 | break; | 1587 | break; |
1434 | } | 1588 | } |
1589 | |||
1590 | pool->pf.mode = new_mode; | ||
1591 | /* | ||
1592 | * The pool mode may have changed, sync it so bind_control_target() | ||
1593 | * doesn't cause an unexpected mode transition on resume. | ||
1594 | */ | ||
1595 | pt->adjusted_pf.mode = new_mode; | ||
1596 | } | ||
1597 | |||
1598 | static void abort_transaction(struct pool *pool) | ||
1599 | { | ||
1600 | const char *dev_name = dm_device_name(pool->pool_md); | ||
1601 | |||
1602 | DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); | ||
1603 | if (dm_pool_abort_metadata(pool->pmd)) { | ||
1604 | DMERR("%s: failed to abort metadata transaction", dev_name); | ||
1605 | set_pool_mode(pool, PM_FAIL); | ||
1606 | } | ||
1607 | |||
1608 | if (dm_pool_metadata_set_needs_check(pool->pmd)) { | ||
1609 | DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); | ||
1610 | set_pool_mode(pool, PM_FAIL); | ||
1611 | } | ||
1612 | } | ||
1613 | |||
1614 | static void metadata_operation_failed(struct pool *pool, const char *op, int r) | ||
1615 | { | ||
1616 | DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", | ||
1617 | dm_device_name(pool->pool_md), op, r); | ||
1618 | |||
1619 | abort_transaction(pool); | ||
1620 | set_pool_mode(pool, PM_READ_ONLY); | ||
1435 | } | 1621 | } |
1436 | 1622 | ||
1437 | /*----------------------------------------------------------------*/ | 1623 | /*----------------------------------------------------------------*/ |
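
abort_transaction() and metadata_operation_failed() introduce a fixed escalation order: log the failing operation, roll the metadata transaction back, persist the needs_check hint, and drop the pool to read-only; if the rollback or the flag update itself fails, the pool goes to fail mode, and set_pool_mode() refuses to lift a pool back out of PM_FAIL. A toy model of that ladder with stub metadata calls follows; none of these helpers are the real dm-thin metadata API.

/* Toy model of the failure-escalation ladder around abort_transaction()
 * and metadata_operation_failed(); all functions here are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

enum pool_mode { PM_WRITE, PM_OUT_OF_DATA_SPACE, PM_READ_ONLY, PM_FAIL };

struct pool {
        enum pool_mode mode;
        bool needs_check;
};

/* Pretend metadata calls; make these return non-zero to see the PM_FAIL path. */
static int abort_metadata(struct pool *p)        { (void)p; return 0; }
static int set_needs_check_flag(struct pool *p)  { p->needs_check = true; return 0; }

static void set_pool_mode(struct pool *p, enum pool_mode m)
{
        /* A pool already in PM_FAIL never recovers without repair. */
        if (p->mode == PM_FAIL)
                return;
        p->mode = m;
}

static void metadata_operation_failed(struct pool *p, const char *op, int r)
{
        fprintf(stderr, "metadata operation '%s' failed: error = %d\n", op, r);

        if (abort_metadata(p))          /* rollback failed: unrecoverable */
                set_pool_mode(p, PM_FAIL);
        if (set_needs_check_flag(p))    /* cannot persist the repair hint */
                set_pool_mode(p, PM_FAIL);

        set_pool_mode(p, PM_READ_ONLY); /* best case: read-only until repaired */
}

int main(void)
{
        struct pool p = { .mode = PM_WRITE, .needs_check = false };

        metadata_operation_failed(&p, "dm_pool_resize_data_dev", -5);
        printf("mode=%d needs_check=%d\n", p.mode, p.needs_check);
        return 0;
}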
@@ -1481,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
1481 | 1667 | ||
1482 | thin_hook_bio(tc, bio); | 1668 | thin_hook_bio(tc, bio); |
1483 | 1669 | ||
1670 | if (tc->requeue_mode) { | ||
1671 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
1672 | return DM_MAPIO_SUBMITTED; | ||
1673 | } | ||
1674 | |||
1484 | if (get_pool_mode(tc->pool) == PM_FAIL) { | 1675 | if (get_pool_mode(tc->pool) == PM_FAIL) { |
1485 | bio_io_error(bio); | 1676 | bio_io_error(bio); |
1486 | return DM_MAPIO_SUBMITTED; | 1677 | return DM_MAPIO_SUBMITTED; |
@@ -1538,9 +1729,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
1538 | if (get_pool_mode(tc->pool) == PM_READ_ONLY) { | 1729 | if (get_pool_mode(tc->pool) == PM_READ_ONLY) { |
1539 | /* | 1730 | /* |
1540 | * This block isn't provisioned, and we have no way | 1731 | * This block isn't provisioned, and we have no way |
1541 | * of doing so. Just error it. | 1732 | * of doing so. |
1542 | */ | 1733 | */ |
1543 | bio_io_error(bio); | 1734 | handle_unserviceable_bio(tc->pool, bio); |
1544 | return DM_MAPIO_SUBMITTED; | 1735 | return DM_MAPIO_SUBMITTED; |
1545 | } | 1736 | } |
1546 | /* fall through */ | 1737 | /* fall through */ |
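
In the read-only map path above, an unprovisioned block is no longer hard-errored; it is handed to handle_unserviceable_bio(). That helper is defined outside the hunks shown here, so the sketch below only models the behaviour suggested by the rest of this patch (the error_if_no_space feature and the retry_on_resume list): error the bio when error_if_no_space is set, otherwise park it for retry. Treat the whole function as an assumption.

/* Hedged sketch of what handle_unserviceable_bio() is assumed to do; the
 * real helper lives outside the hunks shown in this diff. */
#include <stdbool.h>
#include <stdio.h>

enum bio_disposition { BIO_ERRORED, BIO_QUEUED_FOR_RETRY };

struct pool_features { bool error_if_no_space; };

static enum bio_disposition handle_unserviceable_bio(struct pool_features *pf)
{
        if (pf->error_if_no_space)
                return BIO_ERRORED;            /* bio_io_error() path */
        return BIO_QUEUED_FOR_RETRY;           /* retry_on_resume_list path */
}

int main(void)
{
        struct pool_features queueing = { .error_if_no_space = false };
        struct pool_features erroring = { .error_if_no_space = true };

        printf("queue_if_no_space -> %d\n", handle_unserviceable_bio(&queueing));
        printf("error_if_no_space -> %d\n", handle_unserviceable_bio(&erroring));
        return 0;
}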
@@ -1644,22 +1835,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
1644 | /* | 1835 | /* |
1645 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. | 1836 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. |
1646 | */ | 1837 | */ |
1647 | enum pool_mode old_mode = pool->pf.mode; | 1838 | enum pool_mode old_mode = get_pool_mode(pool); |
1648 | enum pool_mode new_mode = pt->adjusted_pf.mode; | 1839 | enum pool_mode new_mode = pt->adjusted_pf.mode; |
1649 | 1840 | ||
1650 | /* | 1841 | /* |
1651 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | 1842 | * Don't change the pool's mode until set_pool_mode() below. |
1652 | * not going to recover without a thin_repair. So we never let the | 1843 | * Otherwise the pool's process_* function pointers may |
1653 | * pool move out of the old mode. On the other hand a PM_READ_ONLY | 1844 | * not match the desired pool mode. |
1654 | * may have been due to a lack of metadata or data space, and may | ||
1655 | * now work (ie. if the underlying devices have been resized). | ||
1656 | */ | 1845 | */ |
1657 | if (old_mode == PM_FAIL) | 1846 | pt->adjusted_pf.mode = old_mode; |
1658 | new_mode = old_mode; | ||
1659 | 1847 | ||
1660 | pool->ti = ti; | 1848 | pool->ti = ti; |
1661 | pool->low_water_blocks = pt->low_water_blocks; | ||
1662 | pool->pf = pt->adjusted_pf; | 1849 | pool->pf = pt->adjusted_pf; |
1850 | pool->low_water_blocks = pt->low_water_blocks; | ||
1663 | 1851 | ||
1664 | set_pool_mode(pool, new_mode); | 1852 | set_pool_mode(pool, new_mode); |
1665 | 1853 | ||
@@ -1682,6 +1870,7 @@ static void pool_features_init(struct pool_features *pf) | |||
1682 | pf->zero_new_blocks = true; | 1870 | pf->zero_new_blocks = true; |
1683 | pf->discard_enabled = true; | 1871 | pf->discard_enabled = true; |
1684 | pf->discard_passdown = true; | 1872 | pf->discard_passdown = true; |
1873 | pf->error_if_no_space = false; | ||
1685 | } | 1874 | } |
1686 | 1875 | ||
1687 | static void __pool_destroy(struct pool *pool) | 1876 | static void __pool_destroy(struct pool *pool) |
@@ -1772,8 +1961,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1772 | bio_list_init(&pool->deferred_flush_bios); | 1961 | bio_list_init(&pool->deferred_flush_bios); |
1773 | INIT_LIST_HEAD(&pool->prepared_mappings); | 1962 | INIT_LIST_HEAD(&pool->prepared_mappings); |
1774 | INIT_LIST_HEAD(&pool->prepared_discards); | 1963 | INIT_LIST_HEAD(&pool->prepared_discards); |
1775 | pool->low_water_triggered = 0; | 1964 | pool->low_water_triggered = false; |
1776 | pool->no_free_space = 0; | ||
1777 | bio_list_init(&pool->retry_on_resume_list); | 1965 | bio_list_init(&pool->retry_on_resume_list); |
1778 | 1966 | ||
1779 | pool->shared_read_ds = dm_deferred_set_create(); | 1967 | pool->shared_read_ds = dm_deferred_set_create(); |
@@ -1898,7 +2086,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1898 | const char *arg_name; | 2086 | const char *arg_name; |
1899 | 2087 | ||
1900 | static struct dm_arg _args[] = { | 2088 | static struct dm_arg _args[] = { |
1901 | {0, 3, "Invalid number of pool feature arguments"}, | 2089 | {0, 4, "Invalid number of pool feature arguments"}, |
1902 | }; | 2090 | }; |
1903 | 2091 | ||
1904 | /* | 2092 | /* |
@@ -1927,6 +2115,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1927 | else if (!strcasecmp(arg_name, "read_only")) | 2115 | else if (!strcasecmp(arg_name, "read_only")) |
1928 | pf->mode = PM_READ_ONLY; | 2116 | pf->mode = PM_READ_ONLY; |
1929 | 2117 | ||
2118 | else if (!strcasecmp(arg_name, "error_if_no_space")) | ||
2119 | pf->error_if_no_space = true; | ||
2120 | |||
1930 | else { | 2121 | else { |
1931 | ti->error = "Unrecognised pool feature requested"; | 2122 | ti->error = "Unrecognised pool feature requested"; |
1932 | r = -EINVAL; | 2123 | r = -EINVAL; |
@@ -1947,16 +2138,27 @@ static void metadata_low_callback(void *context) | |||
1947 | dm_table_event(pool->ti->table); | 2138 | dm_table_event(pool->ti->table); |
1948 | } | 2139 | } |
1949 | 2140 | ||
1950 | static sector_t get_metadata_dev_size(struct block_device *bdev) | 2141 | static sector_t get_dev_size(struct block_device *bdev) |
1951 | { | 2142 | { |
1952 | sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | 2143 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; |
2144 | } | ||
2145 | |||
2146 | static void warn_if_metadata_device_too_big(struct block_device *bdev) | ||
2147 | { | ||
2148 | sector_t metadata_dev_size = get_dev_size(bdev); | ||
1953 | char buffer[BDEVNAME_SIZE]; | 2149 | char buffer[BDEVNAME_SIZE]; |
1954 | 2150 | ||
1955 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { | 2151 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) |
1956 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | 2152 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", |
1957 | bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); | 2153 | bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); |
1958 | metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; | 2154 | } |
1959 | } | 2155 | |
2156 | static sector_t get_metadata_dev_size(struct block_device *bdev) | ||
2157 | { | ||
2158 | sector_t metadata_dev_size = get_dev_size(bdev); | ||
2159 | |||
2160 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) | ||
2161 | metadata_dev_size = THIN_METADATA_MAX_SECTORS; | ||
1960 | 2162 | ||
1961 | return metadata_dev_size; | 2163 | return metadata_dev_size; |
1962 | } | 2164 | } |
@@ -1965,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) | |||
1965 | { | 2167 | { |
1966 | sector_t metadata_dev_size = get_metadata_dev_size(bdev); | 2168 | sector_t metadata_dev_size = get_metadata_dev_size(bdev); |
1967 | 2169 | ||
1968 | sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | 2170 | sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); |
1969 | 2171 | ||
1970 | return metadata_dev_size; | 2172 | return metadata_dev_size; |
1971 | } | 2173 | } |
@@ -1997,6 +2199,8 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt) | |||
1997 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. | 2199 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. |
1998 | * ignore_discard: disable discard | 2200 | * ignore_discard: disable discard |
1999 | * no_discard_passdown: don't pass discards down to the data device | 2201 | * no_discard_passdown: don't pass discards down to the data device |
2202 | * read_only: Don't allow any changes to be made to the pool metadata. | ||
2203 | * error_if_no_space: error IOs, instead of queueing, if no space. | ||
2000 | */ | 2204 | */ |
2001 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | 2205 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) |
2002 | { | 2206 | { |
@@ -2041,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2041 | ti->error = "Error opening metadata block device"; | 2245 | ti->error = "Error opening metadata block device"; |
2042 | goto out_unlock; | 2246 | goto out_unlock; |
2043 | } | 2247 | } |
2044 | 2248 | warn_if_metadata_device_too_big(metadata_dev->bdev); | |
2045 | /* | ||
2046 | * Run for the side-effect of possibly issuing a warning if the | ||
2047 | * device is too big. | ||
2048 | */ | ||
2049 | (void) get_metadata_dev_size(metadata_dev->bdev); | ||
2050 | 2249 | ||
2051 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); | 2250 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); |
2052 | if (r) { | 2251 | if (r) { |
@@ -2192,11 +2391,19 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) | |||
2192 | return -EINVAL; | 2391 | return -EINVAL; |
2193 | 2392 | ||
2194 | } else if (data_size > sb_data_size) { | 2393 | } else if (data_size > sb_data_size) { |
2394 | if (dm_pool_metadata_needs_check(pool->pmd)) { | ||
2395 | DMERR("%s: unable to grow the data device until repaired.", | ||
2396 | dm_device_name(pool->pool_md)); | ||
2397 | return 0; | ||
2398 | } | ||
2399 | |||
2400 | if (sb_data_size) | ||
2401 | DMINFO("%s: growing the data device from %llu to %llu blocks", | ||
2402 | dm_device_name(pool->pool_md), | ||
2403 | sb_data_size, (unsigned long long)data_size); | ||
2195 | r = dm_pool_resize_data_dev(pool->pmd, data_size); | 2404 | r = dm_pool_resize_data_dev(pool->pmd, data_size); |
2196 | if (r) { | 2405 | if (r) { |
2197 | DMERR("%s: failed to resize data device", | 2406 | metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); |
2198 | dm_device_name(pool->pool_md)); | ||
2199 | set_pool_mode(pool, PM_READ_ONLY); | ||
2200 | return r; | 2407 | return r; |
2201 | } | 2408 | } |
2202 | 2409 | ||
@@ -2231,10 +2438,19 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) | |||
2231 | return -EINVAL; | 2438 | return -EINVAL; |
2232 | 2439 | ||
2233 | } else if (metadata_dev_size > sb_metadata_dev_size) { | 2440 | } else if (metadata_dev_size > sb_metadata_dev_size) { |
2441 | if (dm_pool_metadata_needs_check(pool->pmd)) { | ||
2442 | DMERR("%s: unable to grow the metadata device until repaired.", | ||
2443 | dm_device_name(pool->pool_md)); | ||
2444 | return 0; | ||
2445 | } | ||
2446 | |||
2447 | warn_if_metadata_device_too_big(pool->md_dev); | ||
2448 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", | ||
2449 | dm_device_name(pool->pool_md), | ||
2450 | sb_metadata_dev_size, metadata_dev_size); | ||
2234 | r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); | 2451 | r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); |
2235 | if (r) { | 2452 | if (r) { |
2236 | DMERR("%s: failed to resize metadata device", | 2453 | metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); |
2237 | dm_device_name(pool->pool_md)); | ||
2238 | return r; | 2454 | return r; |
2239 | } | 2455 | } |
2240 | 2456 | ||
@@ -2290,8 +2506,7 @@ static void pool_resume(struct dm_target *ti) | |||
2290 | unsigned long flags; | 2506 | unsigned long flags; |
2291 | 2507 | ||
2292 | spin_lock_irqsave(&pool->lock, flags); | 2508 | spin_lock_irqsave(&pool->lock, flags); |
2293 | pool->low_water_triggered = 0; | 2509 | pool->low_water_triggered = false; |
2294 | pool->no_free_space = 0; | ||
2295 | __requeue_bios(pool); | 2510 | __requeue_bios(pool); |
2296 | spin_unlock_irqrestore(&pool->lock, flags); | 2511 | spin_unlock_irqrestore(&pool->lock, flags); |
2297 | 2512 | ||
@@ -2510,7 +2725,8 @@ static void emit_flags(struct pool_features *pf, char *result, | |||
2510 | unsigned sz, unsigned maxlen) | 2725 | unsigned sz, unsigned maxlen) |
2511 | { | 2726 | { |
2512 | unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + | 2727 | unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + |
2513 | !pf->discard_passdown + (pf->mode == PM_READ_ONLY); | 2728 | !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + |
2729 | pf->error_if_no_space; | ||
2514 | DMEMIT("%u ", count); | 2730 | DMEMIT("%u ", count); |
2515 | 2731 | ||
2516 | if (!pf->zero_new_blocks) | 2732 | if (!pf->zero_new_blocks) |
@@ -2524,6 +2740,9 @@ static void emit_flags(struct pool_features *pf, char *result, | |||
2524 | 2740 | ||
2525 | if (pf->mode == PM_READ_ONLY) | 2741 | if (pf->mode == PM_READ_ONLY) |
2526 | DMEMIT("read_only "); | 2742 | DMEMIT("read_only "); |
2743 | |||
2744 | if (pf->error_if_no_space) | ||
2745 | DMEMIT("error_if_no_space "); | ||
2527 | } | 2746 | } |
2528 | 2747 | ||
2529 | /* | 2748 | /* |
@@ -2612,17 +2831,24 @@ static void pool_status(struct dm_target *ti, status_type_t type, | |||
2612 | else | 2831 | else |
2613 | DMEMIT("- "); | 2832 | DMEMIT("- "); |
2614 | 2833 | ||
2615 | if (pool->pf.mode == PM_READ_ONLY) | 2834 | if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) |
2835 | DMEMIT("out_of_data_space "); | ||
2836 | else if (pool->pf.mode == PM_READ_ONLY) | ||
2616 | DMEMIT("ro "); | 2837 | DMEMIT("ro "); |
2617 | else | 2838 | else |
2618 | DMEMIT("rw "); | 2839 | DMEMIT("rw "); |
2619 | 2840 | ||
2620 | if (!pool->pf.discard_enabled) | 2841 | if (!pool->pf.discard_enabled) |
2621 | DMEMIT("ignore_discard"); | 2842 | DMEMIT("ignore_discard "); |
2622 | else if (pool->pf.discard_passdown) | 2843 | else if (pool->pf.discard_passdown) |
2623 | DMEMIT("discard_passdown"); | 2844 | DMEMIT("discard_passdown "); |
2624 | else | 2845 | else |
2625 | DMEMIT("no_discard_passdown"); | 2846 | DMEMIT("no_discard_passdown "); |
2847 | |||
2848 | if (pool->pf.error_if_no_space) | ||
2849 | DMEMIT("error_if_no_space "); | ||
2850 | else | ||
2851 | DMEMIT("queue_if_no_space "); | ||
2626 | 2852 | ||
2627 | break; | 2853 | break; |
2628 | 2854 | ||
@@ -2721,7 +2947,7 @@ static struct target_type pool_target = { | |||
2721 | .name = "thin-pool", | 2947 | .name = "thin-pool", |
2722 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2948 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2723 | DM_TARGET_IMMUTABLE, | 2949 | DM_TARGET_IMMUTABLE, |
2724 | .version = {1, 9, 0}, | 2950 | .version = {1, 11, 0}, |
2725 | .module = THIS_MODULE, | 2951 | .module = THIS_MODULE, |
2726 | .ctr = pool_ctr, | 2952 | .ctr = pool_ctr, |
2727 | .dtr = pool_dtr, | 2953 | .dtr = pool_dtr, |
@@ -2828,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2828 | 3054 | ||
2829 | if (get_pool_mode(tc->pool) == PM_FAIL) { | 3055 | if (get_pool_mode(tc->pool) == PM_FAIL) { |
2830 | ti->error = "Couldn't open thin device, Pool is in fail mode"; | 3056 | ti->error = "Couldn't open thin device, Pool is in fail mode"; |
3057 | r = -EINVAL; | ||
2831 | goto bad_thin_open; | 3058 | goto bad_thin_open; |
2832 | } | 3059 | } |
2833 | 3060 | ||
@@ -2839,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2839 | 3066 | ||
2840 | r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); | 3067 | r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); |
2841 | if (r) | 3068 | if (r) |
2842 | goto bad_thin_open; | 3069 | goto bad_target_max_io_len; |
2843 | 3070 | ||
2844 | ti->num_flush_bios = 1; | 3071 | ti->num_flush_bios = 1; |
2845 | ti->flush_supported = true; | 3072 | ti->flush_supported = true; |
@@ -2860,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2860 | 3087 | ||
2861 | return 0; | 3088 | return 0; |
2862 | 3089 | ||
3090 | bad_target_max_io_len: | ||
3091 | dm_pool_close_thin_device(tc->td); | ||
2863 | bad_thin_open: | 3092 | bad_thin_open: |
2864 | __pool_dec(tc->pool); | 3093 | __pool_dec(tc->pool); |
2865 | bad_pool_lookup: | 3094 | bad_pool_lookup: |
@@ -2879,7 +3108,7 @@ out_unlock: | |||
2879 | 3108 | ||
2880 | static int thin_map(struct dm_target *ti, struct bio *bio) | 3109 | static int thin_map(struct dm_target *ti, struct bio *bio) |
2881 | { | 3110 | { |
2882 | bio->bi_sector = dm_target_offset(ti, bio->bi_sector); | 3111 | bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); |
2883 | 3112 | ||
2884 | return thin_bio_map(ti, bio); | 3113 | return thin_bio_map(ti, bio); |
2885 | } | 3114 | } |
@@ -2899,7 +3128,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) | |||
2899 | spin_lock_irqsave(&pool->lock, flags); | 3128 | spin_lock_irqsave(&pool->lock, flags); |
2900 | list_for_each_entry_safe(m, tmp, &work, list) { | 3129 | list_for_each_entry_safe(m, tmp, &work, list) { |
2901 | list_del(&m->list); | 3130 | list_del(&m->list); |
2902 | m->quiesced = 1; | 3131 | m->quiesced = true; |
2903 | __maybe_add_mapping(m); | 3132 | __maybe_add_mapping(m); |
2904 | } | 3133 | } |
2905 | spin_unlock_irqrestore(&pool->lock, flags); | 3134 | spin_unlock_irqrestore(&pool->lock, flags); |
@@ -2911,7 +3140,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) | |||
2911 | if (!list_empty(&work)) { | 3140 | if (!list_empty(&work)) { |
2912 | spin_lock_irqsave(&pool->lock, flags); | 3141 | spin_lock_irqsave(&pool->lock, flags); |
2913 | list_for_each_entry_safe(m, tmp, &work, list) | 3142 | list_for_each_entry_safe(m, tmp, &work, list) |
2914 | list_add(&m->list, &pool->prepared_discards); | 3143 | list_add_tail(&m->list, &pool->prepared_discards); |
2915 | spin_unlock_irqrestore(&pool->lock, flags); | 3144 | spin_unlock_irqrestore(&pool->lock, flags); |
2916 | wake_worker(pool); | 3145 | wake_worker(pool); |
2917 | } | 3146 | } |
@@ -2920,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) | |||
2920 | return 0; | 3149 | return 0; |
2921 | } | 3150 | } |
2922 | 3151 | ||
2923 | static void thin_postsuspend(struct dm_target *ti) | 3152 | static void thin_presuspend(struct dm_target *ti) |
2924 | { | 3153 | { |
3154 | struct thin_c *tc = ti->private; | ||
3155 | |||
2925 | if (dm_noflush_suspending(ti)) | 3156 | if (dm_noflush_suspending(ti)) |
2926 | requeue_io((struct thin_c *)ti->private); | 3157 | noflush_work(tc, do_noflush_start); |
3158 | } | ||
3159 | |||
3160 | static void thin_postsuspend(struct dm_target *ti) | ||
3161 | { | ||
3162 | struct thin_c *tc = ti->private; | ||
3163 | |||
3164 | /* | ||
3165 | * The dm_noflush_suspending flag has been cleared by now, so | ||
3166 | * unfortunately we must always run this. | ||
3167 | */ | ||
3168 | noflush_work(tc, do_noflush_stop); | ||
2927 | } | 3169 | } |
2928 | 3170 | ||
2929 | /* | 3171 | /* |
@@ -3008,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
3008 | 3250 | ||
3009 | static struct target_type thin_target = { | 3251 | static struct target_type thin_target = { |
3010 | .name = "thin", | 3252 | .name = "thin", |
3011 | .version = {1, 9, 0}, | 3253 | .version = {1, 11, 0}, |
3012 | .module = THIS_MODULE, | 3254 | .module = THIS_MODULE, |
3013 | .ctr = thin_ctr, | 3255 | .ctr = thin_ctr, |
3014 | .dtr = thin_dtr, | 3256 | .dtr = thin_dtr, |
3015 | .map = thin_map, | 3257 | .map = thin_map, |
3016 | .end_io = thin_endio, | 3258 | .end_io = thin_endio, |
3259 | .presuspend = thin_presuspend, | ||
3017 | .postsuspend = thin_postsuspend, | 3260 | .postsuspend = thin_postsuspend, |
3018 | .status = thin_status, | 3261 | .status = thin_status, |
3019 | .iterate_devices = thin_iterate_devices, | 3262 | .iterate_devices = thin_iterate_devices, |
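
Taken together, the thin target changes bracket a noflush suspend with a per-device requeue_mode flag: thin_presuspend sets it via do_noflush_start(), thin_postsuspend clears it via do_noflush_stop(), and both thin_bio_map() and the deferred-bio worker hand bios back with DM_ENDIO_REQUEUE while it is set. A tiny model of that bracket, complementing the pthread sketch earlier; the enum and struct are illustrative only.

/* Sketch of the requeue_mode bracket across a noflush suspend. */
#include <stdbool.h>
#include <stdio.h>

enum map_result { MAP_REMAPPED, MAP_SUBMITTED_REQUEUE };

struct thin { bool requeue_mode; };

static enum map_result thin_map_one(struct thin *tc)
{
        if (tc->requeue_mode)
                return MAP_SUBMITTED_REQUEUE;  /* bio_endio(bio, DM_ENDIO_REQUEUE) */
        return MAP_REMAPPED;                   /* normal remap_and_issue() path */
}

int main(void)
{
        struct thin tc = { .requeue_mode = false };

        printf("normal io: %d\n", thin_map_one(&tc));
        tc.requeue_mode = true;                 /* thin_presuspend -> do_noflush_start */
        printf("during noflush suspend: %d\n", thin_map_one(&tc));
        tc.requeue_mode = false;                /* thin_postsuspend -> do_noflush_stop */
        printf("after resume: %d\n", thin_map_one(&tc));
        return 0;
}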
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 4b7941db3aff..796007a5e0e1 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c | |||
@@ -73,15 +73,10 @@ struct dm_verity_io { | |||
73 | sector_t block; | 73 | sector_t block; |
74 | unsigned n_blocks; | 74 | unsigned n_blocks; |
75 | 75 | ||
76 | /* saved bio vector */ | 76 | struct bvec_iter iter; |
77 | struct bio_vec *io_vec; | ||
78 | unsigned io_vec_size; | ||
79 | 77 | ||
80 | struct work_struct work; | 78 | struct work_struct work; |
81 | 79 | ||
82 | /* A space for short vectors; longer vectors are allocated separately. */ | ||
83 | struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; | ||
84 | |||
85 | /* | 80 | /* |
86 | * Three variably-size fields follow this struct: | 81 | * Three variably-size fields follow this struct: |
87 | * | 82 | * |
@@ -284,9 +279,10 @@ release_ret_r: | |||
284 | static int verity_verify_io(struct dm_verity_io *io) | 279 | static int verity_verify_io(struct dm_verity_io *io) |
285 | { | 280 | { |
286 | struct dm_verity *v = io->v; | 281 | struct dm_verity *v = io->v; |
282 | struct bio *bio = dm_bio_from_per_bio_data(io, | ||
283 | v->ti->per_bio_data_size); | ||
287 | unsigned b; | 284 | unsigned b; |
288 | int i; | 285 | int i; |
289 | unsigned vector = 0, offset = 0; | ||
290 | 286 | ||
291 | for (b = 0; b < io->n_blocks; b++) { | 287 | for (b = 0; b < io->n_blocks; b++) { |
292 | struct shash_desc *desc; | 288 | struct shash_desc *desc; |
@@ -336,31 +332,22 @@ test_block_hash: | |||
336 | } | 332 | } |
337 | 333 | ||
338 | todo = 1 << v->data_dev_block_bits; | 334 | todo = 1 << v->data_dev_block_bits; |
339 | do { | 335 | while (io->iter.bi_size) { |
340 | struct bio_vec *bv; | ||
341 | u8 *page; | 336 | u8 *page; |
342 | unsigned len; | 337 | struct bio_vec bv = bio_iter_iovec(bio, io->iter); |
343 | 338 | ||
344 | BUG_ON(vector >= io->io_vec_size); | 339 | page = kmap_atomic(bv.bv_page); |
345 | bv = &io->io_vec[vector]; | 340 | r = crypto_shash_update(desc, page + bv.bv_offset, |
346 | page = kmap_atomic(bv->bv_page); | 341 | bv.bv_len); |
347 | len = bv->bv_len - offset; | ||
348 | if (likely(len >= todo)) | ||
349 | len = todo; | ||
350 | r = crypto_shash_update(desc, | ||
351 | page + bv->bv_offset + offset, len); | ||
352 | kunmap_atomic(page); | 342 | kunmap_atomic(page); |
343 | |||
353 | if (r < 0) { | 344 | if (r < 0) { |
354 | DMERR("crypto_shash_update failed: %d", r); | 345 | DMERR("crypto_shash_update failed: %d", r); |
355 | return r; | 346 | return r; |
356 | } | 347 | } |
357 | offset += len; | 348 | |
358 | if (likely(offset == bv->bv_len)) { | 349 | bio_advance_iter(bio, &io->iter, bv.bv_len); |
359 | offset = 0; | 350 | } |
360 | vector++; | ||
361 | } | ||
362 | todo -= len; | ||
363 | } while (todo); | ||
364 | 351 | ||
365 | if (!v->version) { | 352 | if (!v->version) { |
366 | r = crypto_shash_update(desc, v->salt, v->salt_size); | 353 | r = crypto_shash_update(desc, v->salt, v->salt_size); |
@@ -383,8 +370,6 @@ test_block_hash: | |||
383 | return -EIO; | 370 | return -EIO; |
384 | } | 371 | } |
385 | } | 372 | } |
386 | BUG_ON(vector != io->io_vec_size); | ||
387 | BUG_ON(offset); | ||
388 | 373 | ||
389 | return 0; | 374 | return 0; |
390 | } | 375 | } |
@@ -400,10 +385,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error) | |||
400 | bio->bi_end_io = io->orig_bi_end_io; | 385 | bio->bi_end_io = io->orig_bi_end_io; |
401 | bio->bi_private = io->orig_bi_private; | 386 | bio->bi_private = io->orig_bi_private; |
402 | 387 | ||
403 | if (io->io_vec != io->io_vec_inline) | 388 | bio_endio_nodec(bio, error); |
404 | mempool_free(io->io_vec, v->vec_mempool); | ||
405 | |||
406 | bio_endio(bio, error); | ||
407 | } | 389 | } |
408 | 390 | ||
409 | static void verity_work(struct work_struct *w) | 391 | static void verity_work(struct work_struct *w) |
@@ -493,9 +475,9 @@ static int verity_map(struct dm_target *ti, struct bio *bio) | |||
493 | struct dm_verity_io *io; | 475 | struct dm_verity_io *io; |
494 | 476 | ||
495 | bio->bi_bdev = v->data_dev->bdev; | 477 | bio->bi_bdev = v->data_dev->bdev; |
496 | bio->bi_sector = verity_map_sector(v, bio->bi_sector); | 478 | bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); |
497 | 479 | ||
498 | if (((unsigned)bio->bi_sector | bio_sectors(bio)) & | 480 | if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & |
499 | ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { | 481 | ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { |
500 | DMERR_LIMIT("unaligned io"); | 482 | DMERR_LIMIT("unaligned io"); |
501 | return -EIO; | 483 | return -EIO; |
@@ -514,18 +496,12 @@ static int verity_map(struct dm_target *ti, struct bio *bio) | |||
514 | io->v = v; | 496 | io->v = v; |
515 | io->orig_bi_end_io = bio->bi_end_io; | 497 | io->orig_bi_end_io = bio->bi_end_io; |
516 | io->orig_bi_private = bio->bi_private; | 498 | io->orig_bi_private = bio->bi_private; |
517 | io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); | 499 | io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); |
518 | io->n_blocks = bio->bi_size >> v->data_dev_block_bits; | 500 | io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; |
519 | 501 | ||
520 | bio->bi_end_io = verity_end_io; | 502 | bio->bi_end_io = verity_end_io; |
521 | bio->bi_private = io; | 503 | bio->bi_private = io; |
522 | io->io_vec_size = bio_segments(bio); | 504 | io->iter = bio->bi_iter; |
523 | if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) | ||
524 | io->io_vec = io->io_vec_inline; | ||
525 | else | ||
526 | io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); | ||
527 | memcpy(io->io_vec, bio_iovec(bio), | ||
528 | io->io_vec_size * sizeof(struct bio_vec)); | ||
529 | 505 | ||
530 | verity_submit_prefetch(v, io); | 506 | verity_submit_prefetch(v, io); |
531 | 507 | ||
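
The dm-verity change drops the saved io_vec copy and instead walks a private copy of the bio's bvec_iter with bio_iter_iovec()/bio_advance_iter(), never modifying the underlying vector. The userspace model below shows the same idea with a plain segment array and a small iterator (index, intra-segment offset, bytes remaining); summing bytes stands in for crypto_shash_update(), and none of the names are kernel API.

/* Userspace model of the iterator-based walk that replaced the saved io_vec. */
#include <stdio.h>
#include <string.h>

struct seg { const unsigned char *base; size_t len; };

struct seg_iter {
        size_t idx;      /* which segment (bi_idx analogue) */
        size_t done;     /* bytes consumed within that segment (bi_bvec_done) */
        size_t size;     /* total bytes remaining (bi_size analogue) */
};

/* Return the current contiguous chunk without touching the segment array. */
static struct seg iter_peek(const struct seg *segs, const struct seg_iter *it)
{
        struct seg cur = {
                .base = segs[it->idx].base + it->done,
                .len = segs[it->idx].len - it->done,
        };
        if (cur.len > it->size)
                cur.len = it->size;
        return cur;
}

static void iter_advance(const struct seg *segs, struct seg_iter *it, size_t bytes)
{
        it->size -= bytes;
        it->done += bytes;
        if (it->done == segs[it->idx].len) {   /* crossed into the next segment */
                it->idx++;
                it->done = 0;
        }
}

int main(void)
{
        const unsigned char a[] = "hello ", b[] = "iterator ", c[] = "world";
        struct seg segs[] = {
                { a, strlen((const char *)a) },
                { b, strlen((const char *)b) },
                { c, strlen((const char *)c) },
        };
        struct seg_iter it = { .idx = 0, .done = 0,
                               .size = segs[0].len + segs[1].len + segs[2].len };
        unsigned long sum = 0;   /* stands in for crypto_shash_update() */

        while (it.size) {
                struct seg cur = iter_peek(segs, &it);
                for (size_t i = 0; i < cur.len; i++)
                        sum += cur.base[i];
                iter_advance(segs, &it, cur.len);
        }
        printf("digested %lu bytes worth of data\n", sum);
        return 0;
}

Because only the small iterator is copied per request, the separate io_vec mempool and the inline-vector fallback become unnecessary, which is what the deleted fields reflect.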
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0704c523a76b..8c53b09b9a2c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -200,8 +200,8 @@ struct mapped_device { | |||
200 | /* forced geometry settings */ | 200 | /* forced geometry settings */ |
201 | struct hd_geometry geometry; | 201 | struct hd_geometry geometry; |
202 | 202 | ||
203 | /* sysfs handle */ | 203 | /* kobject and completion */ |
204 | struct kobject kobj; | 204 | struct dm_kobject_holder kobj_holder; |
205 | 205 | ||
206 | /* zero-length flush that will be cloned and submitted to targets */ | 206 | /* zero-length flush that will be cloned and submitted to targets */ |
207 | struct bio flush_bio; | 207 | struct bio flush_bio; |
@@ -575,7 +575,7 @@ static void start_io_acct(struct dm_io *io) | |||
575 | atomic_inc_return(&md->pending[rw])); | 575 | atomic_inc_return(&md->pending[rw])); |
576 | 576 | ||
577 | if (unlikely(dm_stats_used(&md->stats))) | 577 | if (unlikely(dm_stats_used(&md->stats))) |
578 | dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, | 578 | dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, |
579 | bio_sectors(bio), false, 0, &io->stats_aux); | 579 | bio_sectors(bio), false, 0, &io->stats_aux); |
580 | } | 580 | } |
581 | 581 | ||
@@ -593,7 +593,7 @@ static void end_io_acct(struct dm_io *io) | |||
593 | part_stat_unlock(); | 593 | part_stat_unlock(); |
594 | 594 | ||
595 | if (unlikely(dm_stats_used(&md->stats))) | 595 | if (unlikely(dm_stats_used(&md->stats))) |
596 | dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, | 596 | dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, |
597 | bio_sectors(bio), true, duration, &io->stats_aux); | 597 | bio_sectors(bio), true, duration, &io->stats_aux); |
598 | 598 | ||
599 | /* | 599 | /* |
@@ -742,7 +742,7 @@ static void dec_pending(struct dm_io *io, int error) | |||
742 | if (io_error == DM_ENDIO_REQUEUE) | 742 | if (io_error == DM_ENDIO_REQUEUE) |
743 | return; | 743 | return; |
744 | 744 | ||
745 | if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { | 745 | if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { |
746 | /* | 746 | /* |
747 | * Preflush done for flush with data, reissue | 747 | * Preflush done for flush with data, reissue |
748 | * without REQ_FLUSH. | 748 | * without REQ_FLUSH. |
@@ -797,7 +797,7 @@ static void end_clone_bio(struct bio *clone, int error) | |||
797 | struct dm_rq_clone_bio_info *info = clone->bi_private; | 797 | struct dm_rq_clone_bio_info *info = clone->bi_private; |
798 | struct dm_rq_target_io *tio = info->tio; | 798 | struct dm_rq_target_io *tio = info->tio; |
799 | struct bio *bio = info->orig; | 799 | struct bio *bio = info->orig; |
800 | unsigned int nr_bytes = info->orig->bi_size; | 800 | unsigned int nr_bytes = info->orig->bi_iter.bi_size; |
801 | 801 | ||
802 | bio_put(clone); | 802 | bio_put(clone); |
803 | 803 | ||
@@ -1128,7 +1128,7 @@ static void __map_bio(struct dm_target_io *tio) | |||
1128 | * this io. | 1128 | * this io. |
1129 | */ | 1129 | */ |
1130 | atomic_inc(&tio->io->io_count); | 1130 | atomic_inc(&tio->io->io_count); |
1131 | sector = clone->bi_sector; | 1131 | sector = clone->bi_iter.bi_sector; |
1132 | r = ti->type->map(ti, clone); | 1132 | r = ti->type->map(ti, clone); |
1133 | if (r == DM_MAPIO_REMAPPED) { | 1133 | if (r == DM_MAPIO_REMAPPED) { |
1134 | /* the bio has been remapped so dispatch it */ | 1134 | /* the bio has been remapped so dispatch it */ |
@@ -1155,76 +1155,32 @@ struct clone_info { | |||
1155 | struct dm_io *io; | 1155 | struct dm_io *io; |
1156 | sector_t sector; | 1156 | sector_t sector; |
1157 | sector_t sector_count; | 1157 | sector_t sector_count; |
1158 | unsigned short idx; | ||
1159 | }; | 1158 | }; |
1160 | 1159 | ||
1161 | static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) | 1160 | static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) |
1162 | { | 1161 | { |
1163 | bio->bi_sector = sector; | 1162 | bio->bi_iter.bi_sector = sector; |
1164 | bio->bi_size = to_bytes(len); | 1163 | bio->bi_iter.bi_size = to_bytes(len); |
1165 | } | ||
1166 | |||
1167 | static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) | ||
1168 | { | ||
1169 | bio->bi_idx = idx; | ||
1170 | bio->bi_vcnt = idx + bv_count; | ||
1171 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
1172 | } | ||
1173 | |||
1174 | static void clone_bio_integrity(struct bio *bio, struct bio *clone, | ||
1175 | unsigned short idx, unsigned len, unsigned offset, | ||
1176 | unsigned trim) | ||
1177 | { | ||
1178 | if (!bio_integrity(bio)) | ||
1179 | return; | ||
1180 | |||
1181 | bio_integrity_clone(clone, bio, GFP_NOIO); | ||
1182 | |||
1183 | if (trim) | ||
1184 | bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); | ||
1185 | } | ||
1186 | |||
1187 | /* | ||
1188 | * Creates a little bio that just does part of a bvec. | ||
1189 | */ | ||
1190 | static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, | ||
1191 | sector_t sector, unsigned short idx, | ||
1192 | unsigned offset, unsigned len) | ||
1193 | { | ||
1194 | struct bio *clone = &tio->clone; | ||
1195 | struct bio_vec *bv = bio->bi_io_vec + idx; | ||
1196 | |||
1197 | *clone->bi_io_vec = *bv; | ||
1198 | |||
1199 | bio_setup_sector(clone, sector, len); | ||
1200 | |||
1201 | clone->bi_bdev = bio->bi_bdev; | ||
1202 | clone->bi_rw = bio->bi_rw; | ||
1203 | clone->bi_vcnt = 1; | ||
1204 | clone->bi_io_vec->bv_offset = offset; | ||
1205 | clone->bi_io_vec->bv_len = clone->bi_size; | ||
1206 | clone->bi_flags |= 1 << BIO_CLONED; | ||
1207 | |||
1208 | clone_bio_integrity(bio, clone, idx, len, offset, 1); | ||
1209 | } | 1164 | } |
1210 | 1165 | ||
1211 | /* | 1166 | /* |
1212 | * Creates a bio that consists of range of complete bvecs. | 1167 | * Creates a bio that consists of range of complete bvecs. |
1213 | */ | 1168 | */ |
1214 | static void clone_bio(struct dm_target_io *tio, struct bio *bio, | 1169 | static void clone_bio(struct dm_target_io *tio, struct bio *bio, |
1215 | sector_t sector, unsigned short idx, | 1170 | sector_t sector, unsigned len) |
1216 | unsigned short bv_count, unsigned len) | ||
1217 | { | 1171 | { |
1218 | struct bio *clone = &tio->clone; | 1172 | struct bio *clone = &tio->clone; |
1219 | unsigned trim = 0; | ||
1220 | 1173 | ||
1221 | __bio_clone(clone, bio); | 1174 | __bio_clone_fast(clone, bio); |
1222 | bio_setup_sector(clone, sector, len); | 1175 | |
1223 | bio_setup_bv(clone, idx, bv_count); | 1176 | if (bio_integrity(bio)) |
1177 | bio_integrity_clone(clone, bio, GFP_NOIO); | ||
1224 | 1178 | ||
1225 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) | 1179 | bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); |
1226 | trim = 1; | 1180 | clone->bi_iter.bi_size = to_bytes(len); |
1227 | clone_bio_integrity(bio, clone, idx, len, 0, trim); | 1181 | |
1182 | if (bio_integrity(bio)) | ||
1183 | bio_integrity_trim(clone, 0, len); | ||
1228 | } | 1184 | } |
1229 | 1185 | ||
1230 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | 1186 | static struct dm_target_io *alloc_tio(struct clone_info *ci, |
@@ -1257,7 +1213,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, | |||
1257 | * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush | 1213 | * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush |
1258 | * and discard, so no need for concern about wasted bvec allocations. | 1214 | * and discard, so no need for concern about wasted bvec allocations. |
1259 | */ | 1215 | */ |
1260 | __bio_clone(clone, ci->bio); | 1216 | __bio_clone_fast(clone, ci->bio); |
1261 | if (len) | 1217 | if (len) |
1262 | bio_setup_sector(clone, ci->sector, len); | 1218 | bio_setup_sector(clone, ci->sector, len); |
1263 | 1219 | ||
@@ -1286,10 +1242,7 @@ static int __send_empty_flush(struct clone_info *ci) | |||
1286 | } | 1242 | } |
1287 | 1243 | ||
1288 | static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, | 1244 | static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, |
1289 | sector_t sector, int nr_iovecs, | 1245 | sector_t sector, unsigned len) |
1290 | unsigned short idx, unsigned short bv_count, | ||
1291 | unsigned offset, unsigned len, | ||
1292 | unsigned split_bvec) | ||
1293 | { | 1246 | { |
1294 | struct bio *bio = ci->bio; | 1247 | struct bio *bio = ci->bio; |
1295 | struct dm_target_io *tio; | 1248 | struct dm_target_io *tio; |
@@ -1303,11 +1256,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti | |||
1303 | num_target_bios = ti->num_write_bios(ti, bio); | 1256 | num_target_bios = ti->num_write_bios(ti, bio); |
1304 | 1257 | ||
1305 | for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { | 1258 | for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { |
1306 | tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); | 1259 | tio = alloc_tio(ci, ti, 0, target_bio_nr); |
1307 | if (split_bvec) | 1260 | clone_bio(tio, bio, sector, len); |
1308 | clone_split_bio(tio, bio, sector, idx, offset, len); | ||
1309 | else | ||
1310 | clone_bio(tio, bio, sector, idx, bv_count, len); | ||
1311 | __map_bio(tio); | 1261 | __map_bio(tio); |
1312 | } | 1262 | } |
1313 | } | 1263 | } |
@@ -1379,68 +1329,13 @@ static int __send_write_same(struct clone_info *ci) | |||
1379 | } | 1329 | } |
1380 | 1330 | ||
1381 | /* | 1331 | /* |
1382 | * Find maximum number of sectors / bvecs we can process with a single bio. | ||
1383 | */ | ||
1384 | static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) | ||
1385 | { | ||
1386 | struct bio *bio = ci->bio; | ||
1387 | sector_t bv_len, total_len = 0; | ||
1388 | |||
1389 | for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { | ||
1390 | bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); | ||
1391 | |||
1392 | if (bv_len > max) | ||
1393 | break; | ||
1394 | |||
1395 | max -= bv_len; | ||
1396 | total_len += bv_len; | ||
1397 | } | ||
1398 | |||
1399 | return total_len; | ||
1400 | } | ||
1401 | |||
1402 | static int __split_bvec_across_targets(struct clone_info *ci, | ||
1403 | struct dm_target *ti, sector_t max) | ||
1404 | { | ||
1405 | struct bio *bio = ci->bio; | ||
1406 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; | ||
1407 | sector_t remaining = to_sector(bv->bv_len); | ||
1408 | unsigned offset = 0; | ||
1409 | sector_t len; | ||
1410 | |||
1411 | do { | ||
1412 | if (offset) { | ||
1413 | ti = dm_table_find_target(ci->map, ci->sector); | ||
1414 | if (!dm_target_is_valid(ti)) | ||
1415 | return -EIO; | ||
1416 | |||
1417 | max = max_io_len(ci->sector, ti); | ||
1418 | } | ||
1419 | |||
1420 | len = min(remaining, max); | ||
1421 | |||
1422 | __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, | ||
1423 | bv->bv_offset + offset, len, 1); | ||
1424 | |||
1425 | ci->sector += len; | ||
1426 | ci->sector_count -= len; | ||
1427 | offset += to_bytes(len); | ||
1428 | } while (remaining -= len); | ||
1429 | |||
1430 | ci->idx++; | ||
1431 | |||
1432 | return 0; | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * Select the correct strategy for processing a non-flush bio. | 1332 | * Select the correct strategy for processing a non-flush bio. |
1437 | */ | 1333 | */ |
1438 | static int __split_and_process_non_flush(struct clone_info *ci) | 1334 | static int __split_and_process_non_flush(struct clone_info *ci) |
1439 | { | 1335 | { |
1440 | struct bio *bio = ci->bio; | 1336 | struct bio *bio = ci->bio; |
1441 | struct dm_target *ti; | 1337 | struct dm_target *ti; |
1442 | sector_t len, max; | 1338 | unsigned len; |
1443 | int idx; | ||
1444 | 1339 | ||
1445 | if (unlikely(bio->bi_rw & REQ_DISCARD)) | 1340 | if (unlikely(bio->bi_rw & REQ_DISCARD)) |
1446 | return __send_discard(ci); | 1341 | return __send_discard(ci); |
@@ -1451,41 +1346,14 @@ static int __split_and_process_non_flush(struct clone_info *ci) | |||
1451 | if (!dm_target_is_valid(ti)) | 1346 | if (!dm_target_is_valid(ti)) |
1452 | return -EIO; | 1347 | return -EIO; |
1453 | 1348 | ||
1454 | max = max_io_len(ci->sector, ti); | 1349 | len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); |
1455 | |||
1456 | /* | ||
1457 | * Optimise for the simple case where we can do all of | ||
1458 | * the remaining io with a single clone. | ||
1459 | */ | ||
1460 | if (ci->sector_count <= max) { | ||
1461 | __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, | ||
1462 | ci->idx, bio->bi_vcnt - ci->idx, 0, | ||
1463 | ci->sector_count, 0); | ||
1464 | ci->sector_count = 0; | ||
1465 | return 0; | ||
1466 | } | ||
1467 | 1350 | ||
1468 | /* | 1351 | __clone_and_map_data_bio(ci, ti, ci->sector, len); |
1469 | * There are some bvecs that don't span targets. | ||
1470 | * Do as many of these as possible. | ||
1471 | */ | ||
1472 | if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { | ||
1473 | len = __len_within_target(ci, max, &idx); | ||
1474 | |||
1475 | __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, | ||
1476 | ci->idx, idx - ci->idx, 0, len, 0); | ||
1477 | 1352 | ||
1478 | ci->sector += len; | 1353 | ci->sector += len; |
1479 | ci->sector_count -= len; | 1354 | ci->sector_count -= len; |
1480 | ci->idx = idx; | ||
1481 | 1355 | ||
1482 | return 0; | 1356 | return 0; |
1483 | } | ||
1484 | |||
1485 | /* | ||
1486 | * Handle a bvec that must be split between two or more targets. | ||
1487 | */ | ||
1488 | return __split_bvec_across_targets(ci, ti, max); | ||
1489 | } | 1357 | } |
1490 | 1358 | ||
1491 | /* | 1359 | /* |
@@ -1510,8 +1378,7 @@ static void __split_and_process_bio(struct mapped_device *md, | |||
1510 | ci.io->bio = bio; | 1378 | ci.io->bio = bio; |
1511 | ci.io->md = md; | 1379 | ci.io->md = md; |
1512 | spin_lock_init(&ci.io->endio_lock); | 1380 | spin_lock_init(&ci.io->endio_lock); |
1513 | ci.sector = bio->bi_sector; | 1381 | ci.sector = bio->bi_iter.bi_sector; |
1514 | ci.idx = bio->bi_idx; | ||
1515 | 1382 | ||
1516 | start_io_acct(ci.io); | 1383 | start_io_acct(ci.io); |
1517 | 1384 | ||
@@ -2041,6 +1908,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
2041 | init_waitqueue_head(&md->wait); | 1908 | init_waitqueue_head(&md->wait); |
2042 | INIT_WORK(&md->work, dm_wq_work); | 1909 | INIT_WORK(&md->work, dm_wq_work); |
2043 | init_waitqueue_head(&md->eventq); | 1910 | init_waitqueue_head(&md->eventq); |
1911 | init_completion(&md->kobj_holder.completion); | ||
2044 | 1912 | ||
2045 | md->disk->major = _major; | 1913 | md->disk->major = _major; |
2046 | md->disk->first_minor = minor; | 1914 | md->disk->first_minor = minor; |
@@ -2902,20 +2770,14 @@ struct gendisk *dm_disk(struct mapped_device *md) | |||
2902 | 2770 | ||
2903 | struct kobject *dm_kobject(struct mapped_device *md) | 2771 | struct kobject *dm_kobject(struct mapped_device *md) |
2904 | { | 2772 | { |
2905 | return &md->kobj; | 2773 | return &md->kobj_holder.kobj; |
2906 | } | 2774 | } |
2907 | 2775 | ||
2908 | /* | ||
2909 | * struct mapped_device should not be exported outside of dm.c | ||
2910 | * so use this check to verify that kobj is part of md structure | ||
2911 | */ | ||
2912 | struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | 2776 | struct mapped_device *dm_get_from_kobject(struct kobject *kobj) |
2913 | { | 2777 | { |
2914 | struct mapped_device *md; | 2778 | struct mapped_device *md; |
2915 | 2779 | ||
2916 | md = container_of(kobj, struct mapped_device, kobj); | 2780 | md = container_of(kobj, struct mapped_device, kobj_holder.kobj); |
2917 | if (&md->kobj != kobj) | ||
2918 | return NULL; | ||
2919 | 2781 | ||
2920 | if (test_bit(DMF_FREEING, &md->flags) || | 2782 | if (test_bit(DMF_FREEING, &md->flags) || |
2921 | dm_deleting_md(md)) | 2783 | dm_deleting_md(md)) |
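
With immutable biovecs, dm.c's clone_bio() no longer needs per-bvec bookkeeping or the split-bvec special case: it fast-clones the parent, advances the clone to the wanted sector, and truncates bi_size to the clone's length. A sector-range model of that clone-advance-trim sequence follows; the struct below is an illustration, not struct bio.

/* Range model of the new clone_bio(): clone, advance, then trim. */
#include <assert.h>
#include <stdio.h>

struct range {
        unsigned long long sector;   /* bi_iter.bi_sector analogue */
        unsigned long long size;     /* bi_iter.bi_size analogue (sectors here) */
};

static void range_advance(struct range *r, unsigned long long sectors)
{
        r->sector += sectors;
        r->size -= sectors;
}

/* Carve [sector, sector + len) out of the parent without copying its data. */
static struct range clone_range(const struct range *parent,
                                unsigned long long sector, unsigned long long len)
{
        struct range clone = *parent;                 /* __bio_clone_fast() analogue */

        assert(sector >= parent->sector);
        assert(sector + len <= parent->sector + parent->size);

        range_advance(&clone, sector - clone.sector); /* bio_advance() analogue */
        clone.size = len;                             /* truncate to the target span */
        return clone;
}

int main(void)
{
        struct range parent = { .sector = 1024, .size = 256 };
        struct range c = clone_range(&parent, 1088, 64);

        printf("clone covers sectors %llu..%llu\n", c.sector, c.sector + c.size - 1);
        return 0;
}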
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index c57ba550f69e..c4569f02f50f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <linux/list.h> | 15 | #include <linux/list.h> |
16 | #include <linux/blkdev.h> | 16 | #include <linux/blkdev.h> |
17 | #include <linux/hdreg.h> | 17 | #include <linux/hdreg.h> |
18 | #include <linux/completion.h> | ||
19 | #include <linux/kobject.h> | ||
18 | 20 | ||
19 | #include "dm-stats.h" | 21 | #include "dm-stats.h" |
20 | 22 | ||
@@ -148,12 +150,27 @@ void dm_interface_exit(void); | |||
148 | /* | 150 | /* |
149 | * sysfs interface | 151 | * sysfs interface |
150 | */ | 152 | */ |
153 | struct dm_kobject_holder { | ||
154 | struct kobject kobj; | ||
155 | struct completion completion; | ||
156 | }; | ||
157 | |||
158 | static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) | ||
159 | { | ||
160 | return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; | ||
161 | } | ||
162 | |||
151 | int dm_sysfs_init(struct mapped_device *md); | 163 | int dm_sysfs_init(struct mapped_device *md); |
152 | void dm_sysfs_exit(struct mapped_device *md); | 164 | void dm_sysfs_exit(struct mapped_device *md); |
153 | struct kobject *dm_kobject(struct mapped_device *md); | 165 | struct kobject *dm_kobject(struct mapped_device *md); |
154 | struct mapped_device *dm_get_from_kobject(struct kobject *kobj); | 166 | struct mapped_device *dm_get_from_kobject(struct kobject *kobj); |
155 | 167 | ||
156 | /* | 168 | /* |
169 | * The kobject helper | ||
170 | */ | ||
171 | void dm_kobject_release(struct kobject *kobj); | ||
172 | |||
173 | /* | ||
157 | * Targets for linear and striped mappings | 174 | * Targets for linear and striped mappings |
158 | */ | 175 | */ |
159 | int dm_linear_init(void); | 176 | int dm_linear_init(void); |
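
dm_get_completion_from_kobject() works because the kobject is embedded in dm_kobject_holder next to the completion, so container_of() can recover the holder from the kobject pointer sysfs hands back. Below is a self-contained demonstration of that idiom with stub kobject/completion types; only the container_of() construction itself matches the kernel.

/* Demonstration of the container_of idiom behind dm_kobject_holder. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kobject { const char *name; };          /* stand-in */
struct completion { int done; };               /* stand-in */

struct dm_kobject_holder {
        struct kobject kobj;
        struct completion completion;
};

static struct completion *get_completion_from_kobject(struct kobject *kobj)
{
        return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
}

int main(void)
{
        struct dm_kobject_holder holder = {
                .kobj = { .name = "dm-0" },
                .completion = { .done = 0 },
        };
        struct kobject *kobj = &holder.kobj;   /* what sysfs code would hand back */

        get_completion_from_kobject(kobj)->done = 1;
        printf("%s completion done=%d\n", kobj->name, holder.completion.done);
        return 0;
}

Pairing the completion with the kobject in one holder is what lets the earlier sysfs-only check in dm_get_from_kobject() be dropped: any kobject reaching that code is, by construction, inside a holder inside a mapped_device.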
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 3193aefe982b..e8b4574956c7 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -74,8 +74,8 @@ static void faulty_fail(struct bio *bio, int error) | |||
74 | { | 74 | { |
75 | struct bio *b = bio->bi_private; | 75 | struct bio *b = bio->bi_private; |
76 | 76 | ||
77 | b->bi_size = bio->bi_size; | 77 | b->bi_iter.bi_size = bio->bi_iter.bi_size; |
78 | b->bi_sector = bio->bi_sector; | 78 | b->bi_iter.bi_sector = bio->bi_iter.bi_sector; |
79 | 79 | ||
80 | bio_put(bio); | 80 | bio_put(bio); |
81 | 81 | ||
@@ -185,26 +185,31 @@ static void make_request(struct mddev *mddev, struct bio *bio) | |||
185 | return; | 185 | return; |
186 | } | 186 | } |
187 | 187 | ||
188 | if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) | 188 | if (check_sector(conf, bio->bi_iter.bi_sector, |
189 | bio_end_sector(bio), WRITE)) | ||
189 | failit = 1; | 190 | failit = 1; |
190 | if (check_mode(conf, WritePersistent)) { | 191 | if (check_mode(conf, WritePersistent)) { |
191 | add_sector(conf, bio->bi_sector, WritePersistent); | 192 | add_sector(conf, bio->bi_iter.bi_sector, |
193 | WritePersistent); | ||
192 | failit = 1; | 194 | failit = 1; |
193 | } | 195 | } |
194 | if (check_mode(conf, WriteTransient)) | 196 | if (check_mode(conf, WriteTransient)) |
195 | failit = 1; | 197 | failit = 1; |
196 | } else { | 198 | } else { |
197 | /* read request */ | 199 | /* read request */ |
198 | if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) | 200 | if (check_sector(conf, bio->bi_iter.bi_sector, |
201 | bio_end_sector(bio), READ)) | ||
199 | failit = 1; | 202 | failit = 1; |
200 | if (check_mode(conf, ReadTransient)) | 203 | if (check_mode(conf, ReadTransient)) |
201 | failit = 1; | 204 | failit = 1; |
202 | if (check_mode(conf, ReadPersistent)) { | 205 | if (check_mode(conf, ReadPersistent)) { |
203 | add_sector(conf, bio->bi_sector, ReadPersistent); | 206 | add_sector(conf, bio->bi_iter.bi_sector, |
207 | ReadPersistent); | ||
204 | failit = 1; | 208 | failit = 1; |
205 | } | 209 | } |
206 | if (check_mode(conf, ReadFixable)) { | 210 | if (check_mode(conf, ReadFixable)) { |
207 | add_sector(conf, bio->bi_sector, ReadFixable); | 211 | add_sector(conf, bio->bi_iter.bi_sector, |
212 | ReadFixable); | ||
208 | failit = 1; | 213 | failit = 1; |
209 | } | 214 | } |
210 | } | 215 | } |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index f03fabd2b37b..56f534b4a2d2 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -288,65 +288,65 @@ static int linear_stop (struct mddev *mddev) | |||
288 | 288 | ||
289 | static void linear_make_request(struct mddev *mddev, struct bio *bio) | 289 | static void linear_make_request(struct mddev *mddev, struct bio *bio) |
290 | { | 290 | { |
291 | char b[BDEVNAME_SIZE]; | ||
291 | struct dev_info *tmp_dev; | 292 | struct dev_info *tmp_dev; |
292 | sector_t start_sector; | 293 | struct bio *split; |
294 | sector_t start_sector, end_sector, data_offset; | ||
293 | 295 | ||
294 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 296 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
295 | md_flush_request(mddev, bio); | 297 | md_flush_request(mddev, bio); |
296 | return; | 298 | return; |
297 | } | 299 | } |
298 | 300 | ||
299 | rcu_read_lock(); | 301 | do { |
300 | tmp_dev = which_dev(mddev, bio->bi_sector); | 302 | rcu_read_lock(); |
301 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | ||
302 | |||
303 | |||
304 | if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) | ||
305 | || (bio->bi_sector < start_sector))) { | ||
306 | char b[BDEVNAME_SIZE]; | ||
307 | |||
308 | printk(KERN_ERR | ||
309 | "md/linear:%s: make_request: Sector %llu out of bounds on " | ||
310 | "dev %s: %llu sectors, offset %llu\n", | ||
311 | mdname(mddev), | ||
312 | (unsigned long long)bio->bi_sector, | ||
313 | bdevname(tmp_dev->rdev->bdev, b), | ||
314 | (unsigned long long)tmp_dev->rdev->sectors, | ||
315 | (unsigned long long)start_sector); | ||
316 | rcu_read_unlock(); | ||
317 | bio_io_error(bio); | ||
318 | return; | ||
319 | } | ||
320 | if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) { | ||
321 | /* This bio crosses a device boundary, so we have to | ||
322 | * split it. | ||
323 | */ | ||
324 | struct bio_pair *bp; | ||
325 | sector_t end_sector = tmp_dev->end_sector; | ||
326 | 303 | ||
327 | rcu_read_unlock(); | 304 | tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); |
328 | 305 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | |
329 | bp = bio_split(bio, end_sector - bio->bi_sector); | 306 | end_sector = tmp_dev->end_sector; |
307 | data_offset = tmp_dev->rdev->data_offset; | ||
308 | bio->bi_bdev = tmp_dev->rdev->bdev; | ||
330 | 309 | ||
331 | linear_make_request(mddev, &bp->bio1); | 310 | rcu_read_unlock(); |
332 | linear_make_request(mddev, &bp->bio2); | ||
333 | bio_pair_release(bp); | ||
334 | return; | ||
335 | } | ||
336 | |||
337 | bio->bi_bdev = tmp_dev->rdev->bdev; | ||
338 | bio->bi_sector = bio->bi_sector - start_sector | ||
339 | + tmp_dev->rdev->data_offset; | ||
340 | rcu_read_unlock(); | ||
341 | 311 | ||
342 | if (unlikely((bio->bi_rw & REQ_DISCARD) && | 312 | if (unlikely(bio->bi_iter.bi_sector >= end_sector || |
343 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { | 313 | bio->bi_iter.bi_sector < start_sector)) |
344 | /* Just ignore it */ | 314 | goto out_of_bounds; |
345 | bio_endio(bio, 0); | 315 | |
346 | return; | 316 | if (unlikely(bio_end_sector(bio) > end_sector)) { |
347 | } | 317 | /* This bio crosses a device boundary, so we have to |
318 | * split it. | ||
319 | */ | ||
320 | split = bio_split(bio, end_sector - | ||
321 | bio->bi_iter.bi_sector, | ||
322 | GFP_NOIO, fs_bio_set); | ||
323 | bio_chain(split, bio); | ||
324 | } else { | ||
325 | split = bio; | ||
326 | } | ||
348 | 327 | ||
349 | generic_make_request(bio); | 328 | split->bi_iter.bi_sector = split->bi_iter.bi_sector - |
329 | start_sector + data_offset; | ||
330 | |||
331 | if (unlikely((split->bi_rw & REQ_DISCARD) && | ||
332 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { | ||
333 | /* Just ignore it */ | ||
334 | bio_endio(split, 0); | ||
335 | } else | ||
336 | generic_make_request(split); | ||
337 | } while (split != bio); | ||
338 | return; | ||
339 | |||
340 | out_of_bounds: | ||
341 | printk(KERN_ERR | ||
342 | "md/linear:%s: make_request: Sector %llu out of bounds on " | ||
343 | "dev %s: %llu sectors, offset %llu\n", | ||
344 | mdname(mddev), | ||
345 | (unsigned long long)bio->bi_iter.bi_sector, | ||
346 | bdevname(tmp_dev->rdev->bdev, b), | ||
347 | (unsigned long long)tmp_dev->rdev->sectors, | ||
348 | (unsigned long long)start_sector); | ||
349 | bio_io_error(bio); | ||
350 | } | 350 | } |
351 | 351 | ||
352 | static void linear_status (struct seq_file *seq, struct mddev *mddev) | 352 | static void linear_status (struct seq_file *seq, struct mddev *mddev) |
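
The rewritten linear_make_request() above replaces the old bio_pair/recursion scheme with bio_split() plus bio_chain() in a loop. A minimal sketch of that pattern, assuming two hypothetical helpers, boundary_sectors() and remap(), standing in for which_dev() and the remapping done in the hunk:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical stand-ins, declared only so the sketch is self-contained. */
extern unsigned boundary_sectors(void *conf, sector_t sector);
extern void remap(void *conf, struct bio *bio);

static void example_make_request(void *conf, struct bio *bio)
{
	struct bio *split;

	do {
		/* Sectors that fit before the next device/chunk boundary. */
		unsigned sectors = boundary_sectors(conf, bio->bi_iter.bi_sector);

		if (sectors < bio_sectors(bio)) {
			/* Carve off the leading part; the parent bio only
			 * completes once every chained child has completed. */
			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
			bio_chain(split, bio);
		} else {
			split = bio;
		}

		remap(conf, split);	/* set bi_bdev, shift bi_iter.bi_sector */
		generic_make_request(split);
	} while (split != bio);
}
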
diff --git a/drivers/md/md.c b/drivers/md/md.c index 21f4d7ff0da2..4ad5cc4e63e8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -393,7 +393,7 @@ static void md_submit_flush_data(struct work_struct *ws) | |||
393 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); | 393 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
394 | struct bio *bio = mddev->flush_bio; | 394 | struct bio *bio = mddev->flush_bio; |
395 | 395 | ||
396 | if (bio->bi_size == 0) | 396 | if (bio->bi_iter.bi_size == 0) |
397 | /* an empty barrier - all done */ | 397 | /* an empty barrier - all done */ |
398 | bio_endio(bio, 0); | 398 | bio_endio(bio, 0); |
399 | else { | 399 | else { |
@@ -754,7 +754,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | |||
754 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); | 754 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); |
755 | 755 | ||
756 | bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; | 756 | bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; |
757 | bio->bi_sector = sector; | 757 | bio->bi_iter.bi_sector = sector; |
758 | bio_add_page(bio, page, size, 0); | 758 | bio_add_page(bio, page, size, 0); |
759 | bio->bi_private = rdev; | 759 | bio->bi_private = rdev; |
760 | bio->bi_end_io = super_written; | 760 | bio->bi_end_io = super_written; |
@@ -782,18 +782,16 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | |||
782 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); | 782 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); |
783 | int ret; | 783 | int ret; |
784 | 784 | ||
785 | rw |= REQ_SYNC; | ||
786 | |||
787 | bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? | 785 | bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? |
788 | rdev->meta_bdev : rdev->bdev; | 786 | rdev->meta_bdev : rdev->bdev; |
789 | if (metadata_op) | 787 | if (metadata_op) |
790 | bio->bi_sector = sector + rdev->sb_start; | 788 | bio->bi_iter.bi_sector = sector + rdev->sb_start; |
791 | else if (rdev->mddev->reshape_position != MaxSector && | 789 | else if (rdev->mddev->reshape_position != MaxSector && |
792 | (rdev->mddev->reshape_backwards == | 790 | (rdev->mddev->reshape_backwards == |
793 | (sector >= rdev->mddev->reshape_position))) | 791 | (sector >= rdev->mddev->reshape_position))) |
794 | bio->bi_sector = sector + rdev->new_data_offset; | 792 | bio->bi_iter.bi_sector = sector + rdev->new_data_offset; |
795 | else | 793 | else |
796 | bio->bi_sector = sector + rdev->data_offset; | 794 | bio->bi_iter.bi_sector = sector + rdev->data_offset; |
797 | bio_add_page(bio, page, size, 0); | 795 | bio_add_page(bio, page, size, 0); |
798 | submit_bio_wait(rw, bio); | 796 | submit_bio_wait(rw, bio); |
799 | 797 | ||
@@ -1077,6 +1075,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1077 | rdev->raid_disk = -1; | 1075 | rdev->raid_disk = -1; |
1078 | clear_bit(Faulty, &rdev->flags); | 1076 | clear_bit(Faulty, &rdev->flags); |
1079 | clear_bit(In_sync, &rdev->flags); | 1077 | clear_bit(In_sync, &rdev->flags); |
1078 | clear_bit(Bitmap_sync, &rdev->flags); | ||
1080 | clear_bit(WriteMostly, &rdev->flags); | 1079 | clear_bit(WriteMostly, &rdev->flags); |
1081 | 1080 | ||
1082 | if (mddev->raid_disks == 0) { | 1081 | if (mddev->raid_disks == 0) { |
@@ -1155,6 +1154,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1155 | */ | 1154 | */ |
1156 | if (ev1 < mddev->bitmap->events_cleared) | 1155 | if (ev1 < mddev->bitmap->events_cleared) |
1157 | return 0; | 1156 | return 0; |
1157 | if (ev1 < mddev->events) | ||
1158 | set_bit(Bitmap_sync, &rdev->flags); | ||
1158 | } else { | 1159 | } else { |
1159 | if (ev1 < mddev->events) | 1160 | if (ev1 < mddev->events) |
1160 | /* just a hot-add of a new device, leave raid_disk at -1 */ | 1161 | /* just a hot-add of a new device, leave raid_disk at -1 */ |
@@ -1170,6 +1171,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1170 | desc->raid_disk < mddev->raid_disks */) { | 1171 | desc->raid_disk < mddev->raid_disks */) { |
1171 | set_bit(In_sync, &rdev->flags); | 1172 | set_bit(In_sync, &rdev->flags); |
1172 | rdev->raid_disk = desc->raid_disk; | 1173 | rdev->raid_disk = desc->raid_disk; |
1174 | rdev->saved_raid_disk = desc->raid_disk; | ||
1173 | } else if (desc->state & (1<<MD_DISK_ACTIVE)) { | 1175 | } else if (desc->state & (1<<MD_DISK_ACTIVE)) { |
1174 | /* active but not in sync implies recovery up to | 1176 | /* active but not in sync implies recovery up to |
1175 | * reshape position. We don't know exactly where | 1177 | * reshape position. We don't know exactly where |
@@ -1563,6 +1565,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1563 | rdev->raid_disk = -1; | 1565 | rdev->raid_disk = -1; |
1564 | clear_bit(Faulty, &rdev->flags); | 1566 | clear_bit(Faulty, &rdev->flags); |
1565 | clear_bit(In_sync, &rdev->flags); | 1567 | clear_bit(In_sync, &rdev->flags); |
1568 | clear_bit(Bitmap_sync, &rdev->flags); | ||
1566 | clear_bit(WriteMostly, &rdev->flags); | 1569 | clear_bit(WriteMostly, &rdev->flags); |
1567 | 1570 | ||
1568 | if (mddev->raid_disks == 0) { | 1571 | if (mddev->raid_disks == 0) { |
@@ -1645,6 +1648,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1645 | */ | 1648 | */ |
1646 | if (ev1 < mddev->bitmap->events_cleared) | 1649 | if (ev1 < mddev->bitmap->events_cleared) |
1647 | return 0; | 1650 | return 0; |
1651 | if (ev1 < mddev->events) | ||
1652 | set_bit(Bitmap_sync, &rdev->flags); | ||
1648 | } else { | 1653 | } else { |
1649 | if (ev1 < mddev->events) | 1654 | if (ev1 < mddev->events) |
1650 | /* just a hot-add of a new device, leave raid_disk at -1 */ | 1655 | /* just a hot-add of a new device, leave raid_disk at -1 */ |
@@ -1665,10 +1670,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1665 | set_bit(Faulty, &rdev->flags); | 1670 | set_bit(Faulty, &rdev->flags); |
1666 | break; | 1671 | break; |
1667 | default: | 1672 | default: |
1673 | rdev->saved_raid_disk = role; | ||
1668 | if ((le32_to_cpu(sb->feature_map) & | 1674 | if ((le32_to_cpu(sb->feature_map) & |
1669 | MD_FEATURE_RECOVERY_OFFSET)) | 1675 | MD_FEATURE_RECOVERY_OFFSET)) { |
1670 | rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); | 1676 | rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); |
1671 | else | 1677 | if (!(le32_to_cpu(sb->feature_map) & |
1678 | MD_FEATURE_RECOVERY_BITMAP)) | ||
1679 | rdev->saved_raid_disk = -1; | ||
1680 | } else | ||
1672 | set_bit(In_sync, &rdev->flags); | 1681 | set_bit(In_sync, &rdev->flags); |
1673 | rdev->raid_disk = role; | 1682 | rdev->raid_disk = role; |
1674 | break; | 1683 | break; |
@@ -1730,6 +1739,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1730 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | 1739 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
1731 | sb->recovery_offset = | 1740 | sb->recovery_offset = |
1732 | cpu_to_le64(rdev->recovery_offset); | 1741 | cpu_to_le64(rdev->recovery_offset); |
1742 | if (rdev->saved_raid_disk >= 0 && mddev->bitmap) | ||
1743 | sb->feature_map |= | ||
1744 | cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); | ||
1733 | } | 1745 | } |
1734 | if (test_bit(Replacement, &rdev->flags)) | 1746 | if (test_bit(Replacement, &rdev->flags)) |
1735 | sb->feature_map |= | 1747 | sb->feature_map |= |
@@ -2471,8 +2483,7 @@ repeat: | |||
2471 | if (rdev->sb_loaded != 1) | 2483 | if (rdev->sb_loaded != 1) |
2472 | continue; /* no noise on spare devices */ | 2484 | continue; /* no noise on spare devices */ |
2473 | 2485 | ||
2474 | if (!test_bit(Faulty, &rdev->flags) && | 2486 | if (!test_bit(Faulty, &rdev->flags)) { |
2475 | rdev->saved_raid_disk == -1) { | ||
2476 | md_super_write(mddev,rdev, | 2487 | md_super_write(mddev,rdev, |
2477 | rdev->sb_start, rdev->sb_size, | 2488 | rdev->sb_start, rdev->sb_size, |
2478 | rdev->sb_page); | 2489 | rdev->sb_page); |
@@ -2488,11 +2499,9 @@ repeat: | |||
2488 | rdev->badblocks.size = 0; | 2499 | rdev->badblocks.size = 0; |
2489 | } | 2500 | } |
2490 | 2501 | ||
2491 | } else if (test_bit(Faulty, &rdev->flags)) | 2502 | } else |
2492 | pr_debug("md: %s (skipping faulty)\n", | 2503 | pr_debug("md: %s (skipping faulty)\n", |
2493 | bdevname(rdev->bdev, b)); | 2504 | bdevname(rdev->bdev, b)); |
2494 | else | ||
2495 | pr_debug("(skipping incremental s/r "); | ||
2496 | 2505 | ||
2497 | if (mddev->level == LEVEL_MULTIPATH) | 2506 | if (mddev->level == LEVEL_MULTIPATH) |
2498 | /* only need to write one superblock... */ | 2507 | /* only need to write one superblock... */ |
@@ -2608,6 +2617,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2608 | * blocked - sets the Blocked flags | 2617 | * blocked - sets the Blocked flags |
2609 | * -blocked - clears the Blocked and possibly simulates an error | 2618 | * -blocked - clears the Blocked and possibly simulates an error |
2610 | * insync - sets Insync providing device isn't active | 2619 | * insync - sets Insync providing device isn't active |
2620 | * -insync - clear Insync for a device with a slot assigned, | ||
2621 | * so that it gets rebuilt based on bitmap | ||
2611 | * write_error - sets WriteErrorSeen | 2622 | * write_error - sets WriteErrorSeen |
2612 | * -write_error - clears WriteErrorSeen | 2623 | * -write_error - clears WriteErrorSeen |
2613 | */ | 2624 | */ |
@@ -2656,6 +2667,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2656 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2667 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
2657 | set_bit(In_sync, &rdev->flags); | 2668 | set_bit(In_sync, &rdev->flags); |
2658 | err = 0; | 2669 | err = 0; |
2670 | } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { | ||
2671 | clear_bit(In_sync, &rdev->flags); | ||
2672 | rdev->saved_raid_disk = rdev->raid_disk; | ||
2673 | rdev->raid_disk = -1; | ||
2674 | err = 0; | ||
2659 | } else if (cmd_match(buf, "write_error")) { | 2675 | } else if (cmd_match(buf, "write_error")) { |
2660 | set_bit(WriteErrorSeen, &rdev->flags); | 2676 | set_bit(WriteErrorSeen, &rdev->flags); |
2661 | err = 0; | 2677 | err = 0; |
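
The new "-insync" branch above gives userspace a way to push a member with an assigned slot back through a bitmap-based rebuild. A sketch of driving it from userspace (the array and member names are made up for the example):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical paths: array md0 with member sda1. */
	int fd = open("/sys/block/md0/md/dev-sda1/state", O_WRONLY);

	if (fd < 0)
		return 1;
	/* Clears In_sync, remembers the slot in saved_raid_disk, and lets
	 * the write-intent bitmap drive the subsequent recovery. */
	if (write(fd, "-insync", 7) != 7) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
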
@@ -2788,6 +2804,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2788 | else | 2804 | else |
2789 | rdev->saved_raid_disk = -1; | 2805 | rdev->saved_raid_disk = -1; |
2790 | clear_bit(In_sync, &rdev->flags); | 2806 | clear_bit(In_sync, &rdev->flags); |
2807 | clear_bit(Bitmap_sync, &rdev->flags); | ||
2791 | err = rdev->mddev->pers-> | 2808 | err = rdev->mddev->pers-> |
2792 | hot_add_disk(rdev->mddev, rdev); | 2809 | hot_add_disk(rdev->mddev, rdev); |
2793 | if (err) { | 2810 | if (err) { |
@@ -3582,6 +3599,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3582 | pers->run(mddev); | 3599 | pers->run(mddev); |
3583 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3600 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3584 | mddev_resume(mddev); | 3601 | mddev_resume(mddev); |
3602 | if (!mddev->thread) | ||
3603 | md_update_sb(mddev, 1); | ||
3585 | sysfs_notify(&mddev->kobj, NULL, "level"); | 3604 | sysfs_notify(&mddev->kobj, NULL, "level"); |
3586 | md_new_event(mddev); | 3605 | md_new_event(mddev); |
3587 | return rv; | 3606 | return rv; |
@@ -5760,8 +5779,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) | |||
5760 | info->raid_disk < mddev->raid_disks) { | 5779 | info->raid_disk < mddev->raid_disks) { |
5761 | rdev->raid_disk = info->raid_disk; | 5780 | rdev->raid_disk = info->raid_disk; |
5762 | set_bit(In_sync, &rdev->flags); | 5781 | set_bit(In_sync, &rdev->flags); |
5782 | clear_bit(Bitmap_sync, &rdev->flags); | ||
5763 | } else | 5783 | } else |
5764 | rdev->raid_disk = -1; | 5784 | rdev->raid_disk = -1; |
5785 | rdev->saved_raid_disk = rdev->raid_disk; | ||
5765 | } else | 5786 | } else |
5766 | super_types[mddev->major_version]. | 5787 | super_types[mddev->major_version]. |
5767 | validate_super(mddev, rdev); | 5788 | validate_super(mddev, rdev); |
@@ -5774,11 +5795,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) | |||
5774 | return -EINVAL; | 5795 | return -EINVAL; |
5775 | } | 5796 | } |
5776 | 5797 | ||
5777 | if (test_bit(In_sync, &rdev->flags)) | ||
5778 | rdev->saved_raid_disk = rdev->raid_disk; | ||
5779 | else | ||
5780 | rdev->saved_raid_disk = -1; | ||
5781 | |||
5782 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ | 5798 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ |
5783 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 5799 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
5784 | set_bit(WriteMostly, &rdev->flags); | 5800 | set_bit(WriteMostly, &rdev->flags); |
@@ -6328,6 +6344,32 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |||
6328 | return 0; | 6344 | return 0; |
6329 | } | 6345 | } |
6330 | 6346 | ||
6347 | static inline bool md_ioctl_valid(unsigned int cmd) | ||
6348 | { | ||
6349 | switch (cmd) { | ||
6350 | case ADD_NEW_DISK: | ||
6351 | case BLKROSET: | ||
6352 | case GET_ARRAY_INFO: | ||
6353 | case GET_BITMAP_FILE: | ||
6354 | case GET_DISK_INFO: | ||
6355 | case HOT_ADD_DISK: | ||
6356 | case HOT_REMOVE_DISK: | ||
6357 | case PRINT_RAID_DEBUG: | ||
6358 | case RAID_AUTORUN: | ||
6359 | case RAID_VERSION: | ||
6360 | case RESTART_ARRAY_RW: | ||
6361 | case RUN_ARRAY: | ||
6362 | case SET_ARRAY_INFO: | ||
6363 | case SET_BITMAP_FILE: | ||
6364 | case SET_DISK_FAULTY: | ||
6365 | case STOP_ARRAY: | ||
6366 | case STOP_ARRAY_RO: | ||
6367 | return true; | ||
6368 | default: | ||
6369 | return false; | ||
6370 | } | ||
6371 | } | ||
6372 | |||
6331 | static int md_ioctl(struct block_device *bdev, fmode_t mode, | 6373 | static int md_ioctl(struct block_device *bdev, fmode_t mode, |
6332 | unsigned int cmd, unsigned long arg) | 6374 | unsigned int cmd, unsigned long arg) |
6333 | { | 6375 | { |
@@ -6336,6 +6378,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6336 | struct mddev *mddev = NULL; | 6378 | struct mddev *mddev = NULL; |
6337 | int ro; | 6379 | int ro; |
6338 | 6380 | ||
6381 | if (!md_ioctl_valid(cmd)) | ||
6382 | return -ENOTTY; | ||
6383 | |||
6339 | switch (cmd) { | 6384 | switch (cmd) { |
6340 | case RAID_VERSION: | 6385 | case RAID_VERSION: |
6341 | case GET_ARRAY_INFO: | 6386 | case GET_ARRAY_INFO: |
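
md_ioctl_valid() above runs before any locking in md_ioctl(), so commands md does not recognise now fail with -ENOTTY up front. A userspace sketch of the observable behaviour (the device path and the bogus command number are arbitrary examples):

#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/dev/md0", O_RDONLY);	/* hypothetical array */

	if (fd < 0)
		return 1;
	if (ioctl(fd, 0x12345678, 0) < 0 && errno == ENOTTY)
		printf("unrecognised ioctl rejected before taking mddev locks\n");
	return 0;
}
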
@@ -7706,10 +7751,12 @@ static int remove_and_add_spares(struct mddev *mddev, | |||
7706 | if (test_bit(Faulty, &rdev->flags)) | 7751 | if (test_bit(Faulty, &rdev->flags)) |
7707 | continue; | 7752 | continue; |
7708 | if (mddev->ro && | 7753 | if (mddev->ro && |
7709 | rdev->saved_raid_disk < 0) | 7754 | ! (rdev->saved_raid_disk >= 0 && |
7755 | !test_bit(Bitmap_sync, &rdev->flags))) | ||
7710 | continue; | 7756 | continue; |
7711 | 7757 | ||
7712 | rdev->recovery_offset = 0; | 7758 | if (rdev->saved_raid_disk < 0) |
7759 | rdev->recovery_offset = 0; | ||
7713 | if (mddev->pers-> | 7760 | if (mddev->pers-> |
7714 | hot_add_disk(mddev, rdev) == 0) { | 7761 | hot_add_disk(mddev, rdev) == 0) { |
7715 | if (sysfs_link_rdev(mddev, rdev)) | 7762 | if (sysfs_link_rdev(mddev, rdev)) |
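
The reworked condition above reads more easily as a predicate: on a read-only array a spare is only worth re-adding when it still has a remembered slot and needs no more than a bitmap-based catch-up. A sketch of that rule (not a helper that exists in md.c):

#include "md.h"

static bool example_can_readd_when_readonly(struct md_rdev *rdev)
{
	/* saved_raid_disk >= 0: the device remembers which slot it held;
	 * !Bitmap_sync: the write-intent bitmap alone can bring it in sync. */
	return rdev->saved_raid_disk >= 0 &&
	       !test_bit(Bitmap_sync, &rdev->flags);
}
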
@@ -7787,9 +7834,12 @@ void md_check_recovery(struct mddev *mddev) | |||
7787 | * As we only add devices that are already in-sync, | 7834 | * As we only add devices that are already in-sync, |
7788 | * we can activate the spares immediately. | 7835 | * we can activate the spares immediately. |
7789 | */ | 7836 | */ |
7790 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7791 | remove_and_add_spares(mddev, NULL); | 7837 | remove_and_add_spares(mddev, NULL); |
7792 | mddev->pers->spare_active(mddev); | 7838 | /* There is no thread, but we need to call |
7839 | * ->spare_active and clear saved_raid_disk | ||
7840 | */ | ||
7841 | md_reap_sync_thread(mddev); | ||
7842 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7793 | goto unlock; | 7843 | goto unlock; |
7794 | } | 7844 | } |
7795 | 7845 | ||
@@ -7926,14 +7976,10 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
7926 | mddev->pers->finish_reshape(mddev); | 7976 | mddev->pers->finish_reshape(mddev); |
7927 | 7977 | ||
7928 | /* If array is no-longer degraded, then any saved_raid_disk | 7978 | /* If array is no-longer degraded, then any saved_raid_disk |
7929 | * information must be scrapped. Also if any device is now | 7979 | * information must be scrapped. |
7930 | * In_sync we must scrape the saved_raid_disk for that device | ||
7931 | * do the superblock for an incrementally recovered device | ||
7932 | * written out. | ||
7933 | */ | 7980 | */ |
7934 | rdev_for_each(rdev, mddev) | 7981 | if (!mddev->degraded) |
7935 | if (!mddev->degraded || | 7982 | rdev_for_each(rdev, mddev) |
7936 | test_bit(In_sync, &rdev->flags)) | ||
7937 | rdev->saved_raid_disk = -1; | 7983 | rdev->saved_raid_disk = -1; |
7938 | 7984 | ||
7939 | md_update_sb(mddev, 1); | 7985 | md_update_sb(mddev, 1); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 2f5cc8a7ef3e..07bba96de260 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -106,7 +106,7 @@ struct md_rdev { | |||
106 | */ | 106 | */ |
107 | struct work_struct del_work; /* used for delayed sysfs removal */ | 107 | struct work_struct del_work; /* used for delayed sysfs removal */ |
108 | 108 | ||
109 | struct sysfs_dirent *sysfs_state; /* handle for 'state' | 109 | struct kernfs_node *sysfs_state; /* handle for 'state' |
110 | * sysfs entry */ | 110 | * sysfs entry */ |
111 | 111 | ||
112 | struct badblocks { | 112 | struct badblocks { |
@@ -129,6 +129,9 @@ struct md_rdev { | |||
129 | enum flag_bits { | 129 | enum flag_bits { |
130 | Faulty, /* device is known to have a fault */ | 130 | Faulty, /* device is known to have a fault */ |
131 | In_sync, /* device is in_sync with rest of array */ | 131 | In_sync, /* device is in_sync with rest of array */ |
132 | Bitmap_sync, /* ..actually, not quite In_sync. Need a | ||
133 | * bitmap-based recovery to get fully in sync | ||
134 | */ | ||
132 | Unmerged, /* device is being added to array and should | 135 | Unmerged, /* device is being added to array and should |
133 | * be considerred for bvec_merge_fn but not | 136 | * be considerred for bvec_merge_fn but not |
134 | * yet for actual IO | 137 | * yet for actual IO |
@@ -376,10 +379,10 @@ struct mddev { | |||
376 | sector_t resync_max; /* resync should pause | 379 | sector_t resync_max; /* resync should pause |
377 | * when it gets here */ | 380 | * when it gets here */ |
378 | 381 | ||
379 | struct sysfs_dirent *sysfs_state; /* handle for 'array_state' | 382 | struct kernfs_node *sysfs_state; /* handle for 'array_state' |
380 | * file in sysfs. | 383 | * file in sysfs. |
381 | */ | 384 | */ |
382 | struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ | 385 | struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ |
383 | 386 | ||
384 | struct work_struct del_work; /* used for delayed sysfs removal */ | 387 | struct work_struct del_work; /* used for delayed sysfs removal */ |
385 | 388 | ||
@@ -498,13 +501,13 @@ struct md_sysfs_entry { | |||
498 | }; | 501 | }; |
499 | extern struct attribute_group md_bitmap_group; | 502 | extern struct attribute_group md_bitmap_group; |
500 | 503 | ||
501 | static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) | 504 | static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) |
502 | { | 505 | { |
503 | if (sd) | 506 | if (sd) |
504 | return sysfs_get_dirent(sd, name); | 507 | return sysfs_get_dirent(sd, name); |
505 | return sd; | 508 | return sd; |
506 | } | 509 | } |
507 | static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) | 510 | static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) |
508 | { | 511 | { |
509 | if (sd) | 512 | if (sd) |
510 | sysfs_notify_dirent(sd); | 513 | sysfs_notify_dirent(sd); |
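
The helpers above keep their NULL-safety across the sysfs_dirent to kernfs_node rename, so callers are unchanged. Typical md usage looks like this sketch, with rdev->sysfs_state as declared earlier in the header:

#include "md.h"

static void example_signal_state_change(struct md_rdev *rdev)
{
	/* No-op if the 'state' attribute has not been created yet. */
	sysfs_notify_dirent_safe(rdev->sysfs_state);
}
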
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1642eae75a33..849ad39f547b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -100,7 +100,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
100 | md_error (mp_bh->mddev, rdev); | 100 | md_error (mp_bh->mddev, rdev); |
101 | printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", | 101 | printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", |
102 | bdevname(rdev->bdev,b), | 102 | bdevname(rdev->bdev,b), |
103 | (unsigned long long)bio->bi_sector); | 103 | (unsigned long long)bio->bi_iter.bi_sector); |
104 | multipath_reschedule_retry(mp_bh); | 104 | multipath_reschedule_retry(mp_bh); |
105 | } else | 105 | } else |
106 | multipath_end_bh_io(mp_bh, error); | 106 | multipath_end_bh_io(mp_bh, error); |
@@ -132,7 +132,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) | |||
132 | multipath = conf->multipaths + mp_bh->path; | 132 | multipath = conf->multipaths + mp_bh->path; |
133 | 133 | ||
134 | mp_bh->bio = *bio; | 134 | mp_bh->bio = *bio; |
135 | mp_bh->bio.bi_sector += multipath->rdev->data_offset; | 135 | mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; |
136 | mp_bh->bio.bi_bdev = multipath->rdev->bdev; | 136 | mp_bh->bio.bi_bdev = multipath->rdev->bdev; |
137 | mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; | 137 | mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; |
138 | mp_bh->bio.bi_end_io = multipath_end_request; | 138 | mp_bh->bio.bi_end_io = multipath_end_request; |
@@ -355,21 +355,22 @@ static void multipathd(struct md_thread *thread) | |||
355 | spin_unlock_irqrestore(&conf->device_lock, flags); | 355 | spin_unlock_irqrestore(&conf->device_lock, flags); |
356 | 356 | ||
357 | bio = &mp_bh->bio; | 357 | bio = &mp_bh->bio; |
358 | bio->bi_sector = mp_bh->master_bio->bi_sector; | 358 | bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; |
359 | 359 | ||
360 | if ((mp_bh->path = multipath_map (conf))<0) { | 360 | if ((mp_bh->path = multipath_map (conf))<0) { |
361 | printk(KERN_ALERT "multipath: %s: unrecoverable IO read" | 361 | printk(KERN_ALERT "multipath: %s: unrecoverable IO read" |
362 | " error for block %llu\n", | 362 | " error for block %llu\n", |
363 | bdevname(bio->bi_bdev,b), | 363 | bdevname(bio->bi_bdev,b), |
364 | (unsigned long long)bio->bi_sector); | 364 | (unsigned long long)bio->bi_iter.bi_sector); |
365 | multipath_end_bh_io(mp_bh, -EIO); | 365 | multipath_end_bh_io(mp_bh, -EIO); |
366 | } else { | 366 | } else { |
367 | printk(KERN_ERR "multipath: %s: redirecting sector %llu" | 367 | printk(KERN_ERR "multipath: %s: redirecting sector %llu" |
368 | " to another IO path\n", | 368 | " to another IO path\n", |
369 | bdevname(bio->bi_bdev,b), | 369 | bdevname(bio->bi_bdev,b), |
370 | (unsigned long long)bio->bi_sector); | 370 | (unsigned long long)bio->bi_iter.bi_sector); |
371 | *bio = *(mp_bh->master_bio); | 371 | *bio = *(mp_bh->master_bio); |
372 | bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; | 372 | bio->bi_iter.bi_sector += |
373 | conf->multipaths[mp_bh->path].rdev->data_offset; | ||
373 | bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; | 374 | bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; |
374 | bio->bi_rw |= REQ_FAILFAST_TRANSPORT; | 375 | bio->bi_rw |= REQ_FAILFAST_TRANSPORT; |
375 | bio->bi_end_io = multipath_end_request; | 376 | bio->bi_end_io = multipath_end_request; |
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index 19b268795415..0c2dec7aec20 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig | |||
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA | |||
6 | ---help--- | 6 | ---help--- |
7 | Library providing immutable on-disk data structure support for | 7 | Library providing immutable on-disk data structure support for |
8 | device-mapper targets such as the thin provisioning target. | 8 | device-mapper targets such as the thin provisioning target. |
9 | |||
10 | config DM_DEBUG_BLOCK_STACK_TRACING | ||
11 | boolean "Keep stack trace of persistent data block lock holders" | ||
12 | depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA | ||
13 | select STACKTRACE | ||
14 | ---help--- | ||
15 | Enable this for messages that may help debug problems with the | ||
16 | block manager locking used by thin provisioning and caching. | ||
17 | |||
18 | If unsure, say N. | ||
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 064a3c271baa..455f79279a16 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
@@ -104,7 +104,7 @@ static int __check_holder(struct block_lock *lock) | |||
104 | 104 | ||
105 | for (i = 0; i < MAX_HOLDERS; i++) { | 105 | for (i = 0; i < MAX_HOLDERS; i++) { |
106 | if (lock->holders[i] == current) { | 106 | if (lock->holders[i] == current) { |
107 | DMERR("recursive lock detected in pool metadata"); | 107 | DMERR("recursive lock detected in metadata"); |
108 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING | 108 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
109 | DMERR("previously held here:"); | 109 | DMERR("previously held here:"); |
110 | print_stack_trace(lock->traces + i, 4); | 110 | print_stack_trace(lock->traces + i, 4); |
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 468e371ee9b2..416060c25709 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c | |||
@@ -770,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify); | |||
770 | 770 | ||
771 | /*----------------------------------------------------------------*/ | 771 | /*----------------------------------------------------------------*/ |
772 | 772 | ||
773 | static int find_highest_key(struct ro_spine *s, dm_block_t block, | 773 | static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, |
774 | uint64_t *result_key, dm_block_t *next_block) | 774 | uint64_t *result_key, dm_block_t *next_block) |
775 | { | 775 | { |
776 | int i, r; | 776 | int i, r; |
777 | uint32_t flags; | 777 | uint32_t flags; |
@@ -788,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, | |||
788 | else | 788 | else |
789 | i--; | 789 | i--; |
790 | 790 | ||
791 | *result_key = le64_to_cpu(ro_node(s)->keys[i]); | 791 | if (find_highest) |
792 | *result_key = le64_to_cpu(ro_node(s)->keys[i]); | ||
793 | else | ||
794 | *result_key = le64_to_cpu(ro_node(s)->keys[0]); | ||
795 | |||
792 | if (next_block || flags & INTERNAL_NODE) | 796 | if (next_block || flags & INTERNAL_NODE) |
793 | block = value64(ro_node(s), i); | 797 | block = value64(ro_node(s), i); |
794 | 798 | ||
@@ -799,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, | |||
799 | return 0; | 803 | return 0; |
800 | } | 804 | } |
801 | 805 | ||
802 | int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, | 806 | static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, |
803 | uint64_t *result_keys) | 807 | bool find_highest, uint64_t *result_keys) |
804 | { | 808 | { |
805 | int r = 0, count = 0, level; | 809 | int r = 0, count = 0, level; |
806 | struct ro_spine spine; | 810 | struct ro_spine spine; |
807 | 811 | ||
808 | init_ro_spine(&spine, info); | 812 | init_ro_spine(&spine, info); |
809 | for (level = 0; level < info->levels; level++) { | 813 | for (level = 0; level < info->levels; level++) { |
810 | r = find_highest_key(&spine, root, result_keys + level, | 814 | r = find_key(&spine, root, find_highest, result_keys + level, |
811 | level == info->levels - 1 ? NULL : &root); | 815 | level == info->levels - 1 ? NULL : &root); |
812 | if (r == -ENODATA) { | 816 | if (r == -ENODATA) { |
813 | r = 0; | 817 | r = 0; |
814 | break; | 818 | break; |
@@ -822,8 +826,23 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, | |||
822 | 826 | ||
823 | return r ? r : count; | 827 | return r ? r : count; |
824 | } | 828 | } |
829 | |||
830 | int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, | ||
831 | uint64_t *result_keys) | ||
832 | { | ||
833 | return dm_btree_find_key(info, root, true, result_keys); | ||
834 | } | ||
825 | EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); | 835 | EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); |
826 | 836 | ||
837 | int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, | ||
838 | uint64_t *result_keys) | ||
839 | { | ||
840 | return dm_btree_find_key(info, root, false, result_keys); | ||
841 | } | ||
842 | EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); | ||
843 | |||
844 | /*----------------------------------------------------------------*/ | ||
845 | |||
827 | /* | 846 | /* |
828 | * FIXME: We shouldn't use a recursive algorithm when we have limited stack | 847 | * FIXME: We shouldn't use a recursive algorithm when we have limited stack |
829 | * space. Also this only works for single level trees. | 848 | * space. Also this only works for single level trees. |
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 8672d159e0b5..dacfc34180b4 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h | |||
@@ -137,6 +137,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, | |||
137 | /* | 137 | /* |
138 | * Returns < 0 on failure. Otherwise the number of key entries that have | 138 | * Returns < 0 on failure. Otherwise the number of key entries that have |
139 | * been filled out. Remember trees can have zero entries, and as such have | 139 | * been filled out. Remember trees can have zero entries, and as such have |
140 | * no lowest key. | ||
141 | */ | ||
142 | int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, | ||
143 | uint64_t *result_keys); | ||
144 | |||
145 | /* | ||
146 | * Returns < 0 on failure. Otherwise the number of key entries that have | ||
147 | * been filled out. Remember trees can have zero entries, and as such have | ||
140 | * no highest key. | 148 | * no highest key. |
141 | */ | 149 | */ |
142 | int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, | 150 | int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, |
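
With dm_btree_find_lowest_key() exported alongside the existing highest-key lookup, a caller can bracket the key range of a tree. A sketch for a single-level tree (info and root come from the caller's metadata context; for multi-level trees the result arrays need info->levels entries):

#include "dm-btree.h"

static int example_key_range(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *lowest, uint64_t *highest)
{
	int r;

	/* Both calls return < 0 on error, otherwise the number of key
	 * entries filled out; 0 means the tree is empty and has no keys. */
	r = dm_btree_find_lowest_key(info, root, lowest);
	if (r <= 0)
		return r;

	r = dm_btree_find_highest_key(info, root, highest);
	return r <= 0 ? r : 0;
}
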
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 466a60bbd716..aacbe70c2c2e 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c | |||
@@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) | |||
245 | return -EINVAL; | 245 | return -EINVAL; |
246 | } | 246 | } |
247 | 247 | ||
248 | /* | ||
249 | * We need to set this before the dm_tm_new_block() call below. | ||
250 | */ | ||
251 | ll->nr_blocks = nr_blocks; | ||
248 | for (i = old_blocks; i < blocks; i++) { | 252 | for (i = old_blocks; i < blocks; i++) { |
249 | struct dm_block *b; | 253 | struct dm_block *b; |
250 | struct disk_index_entry idx; | 254 | struct disk_index_entry idx; |
@@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) | |||
252 | r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); | 256 | r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); |
253 | if (r < 0) | 257 | if (r < 0) |
254 | return r; | 258 | return r; |
259 | |||
255 | idx.blocknr = cpu_to_le64(dm_block_location(b)); | 260 | idx.blocknr = cpu_to_le64(dm_block_location(b)); |
256 | 261 | ||
257 | r = dm_tm_unlock(ll->tm, b); | 262 | r = dm_tm_unlock(ll->tm, b); |
@@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) | |||
266 | return r; | 271 | return r; |
267 | } | 272 | } |
268 | 273 | ||
269 | ll->nr_blocks = nr_blocks; | ||
270 | return 0; | 274 | return 0; |
271 | } | 275 | } |
272 | 276 | ||
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 58fc1eef7499..786b689bdfc7 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c | |||
@@ -91,6 +91,69 @@ struct block_op { | |||
91 | dm_block_t block; | 91 | dm_block_t block; |
92 | }; | 92 | }; |
93 | 93 | ||
94 | struct bop_ring_buffer { | ||
95 | unsigned begin; | ||
96 | unsigned end; | ||
97 | struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; | ||
98 | }; | ||
99 | |||
100 | static void brb_init(struct bop_ring_buffer *brb) | ||
101 | { | ||
102 | brb->begin = 0; | ||
103 | brb->end = 0; | ||
104 | } | ||
105 | |||
106 | static bool brb_empty(struct bop_ring_buffer *brb) | ||
107 | { | ||
108 | return brb->begin == brb->end; | ||
109 | } | ||
110 | |||
111 | static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) | ||
112 | { | ||
113 | unsigned r = old + 1; | ||
114 | return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; | ||
115 | } | ||
116 | |||
117 | static int brb_push(struct bop_ring_buffer *brb, | ||
118 | enum block_op_type type, dm_block_t b) | ||
119 | { | ||
120 | struct block_op *bop; | ||
121 | unsigned next = brb_next(brb, brb->end); | ||
122 | |||
123 | /* | ||
124 | * We don't allow the last bop to be filled, this way we can | ||
125 | * differentiate between full and empty. | ||
126 | */ | ||
127 | if (next == brb->begin) | ||
128 | return -ENOMEM; | ||
129 | |||
130 | bop = brb->bops + brb->end; | ||
131 | bop->type = type; | ||
132 | bop->block = b; | ||
133 | |||
134 | brb->end = next; | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) | ||
140 | { | ||
141 | struct block_op *bop; | ||
142 | |||
143 | if (brb_empty(brb)) | ||
144 | return -ENODATA; | ||
145 | |||
146 | bop = brb->bops + brb->begin; | ||
147 | result->type = bop->type; | ||
148 | result->block = bop->block; | ||
149 | |||
150 | brb->begin = brb_next(brb, brb->begin); | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /*----------------------------------------------------------------*/ | ||
156 | |||
94 | struct sm_metadata { | 157 | struct sm_metadata { |
95 | struct dm_space_map sm; | 158 | struct dm_space_map sm; |
96 | 159 | ||
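
The ring buffer above replaces the fixed uncommitted[] array; one slot is deliberately left unused so that begin == end always means empty. A usage sketch (BOP_INC is assumed from the existing enum block_op_type in this file):

static void example_brb_usage(void)
{
	struct bop_ring_buffer brb;
	struct block_op bop;

	brb_init(&brb);

	/* Fills MAX_RECURSIVE_ALLOCATIONS entries, then returns -ENOMEM
	 * because the spare slot must stay free. */
	while (!brb_push(&brb, BOP_INC, 123))
		;

	/* Drains in FIFO order; returns -ENODATA once empty. */
	while (!brb_pop(&brb, &bop))
		;
}
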
@@ -101,25 +164,20 @@ struct sm_metadata { | |||
101 | 164 | ||
102 | unsigned recursion_count; | 165 | unsigned recursion_count; |
103 | unsigned allocated_this_transaction; | 166 | unsigned allocated_this_transaction; |
104 | unsigned nr_uncommitted; | 167 | struct bop_ring_buffer uncommitted; |
105 | struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; | ||
106 | 168 | ||
107 | struct threshold threshold; | 169 | struct threshold threshold; |
108 | }; | 170 | }; |
109 | 171 | ||
110 | static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) | 172 | static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) |
111 | { | 173 | { |
112 | struct block_op *op; | 174 | int r = brb_push(&smm->uncommitted, type, b); |
113 | 175 | ||
114 | if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { | 176 | if (r) { |
115 | DMERR("too many recursive allocations"); | 177 | DMERR("too many recursive allocations"); |
116 | return -ENOMEM; | 178 | return -ENOMEM; |
117 | } | 179 | } |
118 | 180 | ||
119 | op = smm->uncommitted + smm->nr_uncommitted++; | ||
120 | op->type = type; | ||
121 | op->block = b; | ||
122 | |||
123 | return 0; | 181 | return 0; |
124 | } | 182 | } |
125 | 183 | ||
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm) | |||
158 | return -ENOMEM; | 216 | return -ENOMEM; |
159 | } | 217 | } |
160 | 218 | ||
161 | if (smm->recursion_count == 1 && smm->nr_uncommitted) { | 219 | if (smm->recursion_count == 1) { |
162 | while (smm->nr_uncommitted && !r) { | 220 | while (!brb_empty(&smm->uncommitted)) { |
163 | smm->nr_uncommitted--; | 221 | struct block_op bop; |
164 | r = commit_bop(smm, smm->uncommitted + | 222 | |
165 | smm->nr_uncommitted); | 223 | r = brb_pop(&smm->uncommitted, &bop); |
224 | if (r) { | ||
225 | DMERR("bug in bop ring buffer"); | ||
226 | break; | ||
227 | } | ||
228 | |||
229 | r = commit_bop(smm, &bop); | ||
166 | if (r) | 230 | if (r) |
167 | break; | 231 | break; |
168 | } | 232 | } |
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) | |||
217 | static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | 281 | static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, |
218 | uint32_t *result) | 282 | uint32_t *result) |
219 | { | 283 | { |
220 | int r, i; | 284 | int r; |
285 | unsigned i; | ||
221 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 286 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
222 | unsigned adjustment = 0; | 287 | unsigned adjustment = 0; |
223 | 288 | ||
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | |||
225 | * We may have some uncommitted adjustments to add. This list | 290 | * We may have some uncommitted adjustments to add. This list |
226 | * should always be really short. | 291 | * should always be really short. |
227 | */ | 292 | */ |
228 | for (i = 0; i < smm->nr_uncommitted; i++) { | 293 | for (i = smm->uncommitted.begin; |
229 | struct block_op *op = smm->uncommitted + i; | 294 | i != smm->uncommitted.end; |
295 | i = brb_next(&smm->uncommitted, i)) { | ||
296 | struct block_op *op = smm->uncommitted.bops + i; | ||
230 | 297 | ||
231 | if (op->block != b) | 298 | if (op->block != b) |
232 | continue; | 299 | continue; |
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | |||
254 | static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, | 321 | static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, |
255 | dm_block_t b, int *result) | 322 | dm_block_t b, int *result) |
256 | { | 323 | { |
257 | int r, i, adjustment = 0; | 324 | int r, adjustment = 0; |
325 | unsigned i; | ||
258 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 326 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
259 | uint32_t rc; | 327 | uint32_t rc; |
260 | 328 | ||
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, | |||
262 | * We may have some uncommitted adjustments to add. This list | 330 | * We may have some uncommitted adjustments to add. This list |
263 | * should always be really short. | 331 | * should always be really short. |
264 | */ | 332 | */ |
265 | for (i = 0; i < smm->nr_uncommitted; i++) { | 333 | for (i = smm->uncommitted.begin; |
266 | struct block_op *op = smm->uncommitted + i; | 334 | i != smm->uncommitted.end; |
335 | i = brb_next(&smm->uncommitted, i)) { | ||
336 | |||
337 | struct block_op *op = smm->uncommitted.bops + i; | ||
267 | 338 | ||
268 | if (op->block != b) | 339 | if (op->block != b) |
269 | continue; | 340 | continue; |
@@ -385,13 +456,13 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) | |||
385 | 456 | ||
386 | int r = sm_metadata_new_block_(sm, b); | 457 | int r = sm_metadata_new_block_(sm, b); |
387 | if (r) { | 458 | if (r) { |
388 | DMERR("unable to allocate new metadata block"); | 459 | DMERR_LIMIT("unable to allocate new metadata block"); |
389 | return r; | 460 | return r; |
390 | } | 461 | } |
391 | 462 | ||
392 | r = sm_metadata_get_nr_free(sm, &count); | 463 | r = sm_metadata_get_nr_free(sm, &count); |
393 | if (r) { | 464 | if (r) { |
394 | DMERR("couldn't get free block count"); | 465 | DMERR_LIMIT("couldn't get free block count"); |
395 | return r; | 466 | return r; |
396 | } | 467 | } |
397 | 468 | ||
@@ -608,20 +679,38 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) | |||
608 | * Flick into a mode where all blocks get allocated in the new area. | 679 | * Flick into a mode where all blocks get allocated in the new area. |
609 | */ | 680 | */ |
610 | smm->begin = old_len; | 681 | smm->begin = old_len; |
611 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); | 682 | memcpy(sm, &bootstrap_ops, sizeof(*sm)); |
612 | 683 | ||
613 | /* | 684 | /* |
614 | * Extend. | 685 | * Extend. |
615 | */ | 686 | */ |
616 | r = sm_ll_extend(&smm->ll, extra_blocks); | 687 | r = sm_ll_extend(&smm->ll, extra_blocks); |
688 | if (r) | ||
689 | goto out; | ||
617 | 690 | ||
618 | /* | 691 | /* |
619 | * Switch back to normal behaviour. | 692 | * We repeatedly increment then commit until the commit doesn't |
693 | * allocate any new blocks. | ||
620 | */ | 694 | */ |
621 | memcpy(&smm->sm, &ops, sizeof(smm->sm)); | 695 | do { |
622 | for (i = old_len; !r && i < smm->begin; i++) | 696 | for (i = old_len; !r && i < smm->begin; i++) { |
623 | r = sm_ll_inc(&smm->ll, i, &ev); | 697 | r = sm_ll_inc(&smm->ll, i, &ev); |
698 | if (r) | ||
699 | goto out; | ||
700 | } | ||
701 | old_len = smm->begin; | ||
702 | |||
703 | r = sm_ll_commit(&smm->ll); | ||
704 | if (r) | ||
705 | goto out; | ||
706 | |||
707 | } while (old_len != smm->begin); | ||
624 | 708 | ||
709 | out: | ||
710 | /* | ||
711 | * Switch back to normal behaviour. | ||
712 | */ | ||
713 | memcpy(sm, &ops, sizeof(*sm)); | ||
625 | return r; | 714 | return r; |
626 | } | 715 | } |
627 | 716 | ||
@@ -653,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, | |||
653 | smm->begin = superblock + 1; | 742 | smm->begin = superblock + 1; |
654 | smm->recursion_count = 0; | 743 | smm->recursion_count = 0; |
655 | smm->allocated_this_transaction = 0; | 744 | smm->allocated_this_transaction = 0; |
656 | smm->nr_uncommitted = 0; | 745 | brb_init(&smm->uncommitted); |
657 | threshold_init(&smm->threshold); | 746 | threshold_init(&smm->threshold); |
658 | 747 | ||
659 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); | 748 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); |
@@ -662,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, | |||
662 | if (r) | 751 | if (r) |
663 | return r; | 752 | return r; |
664 | 753 | ||
754 | if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) | ||
755 | nr_blocks = DM_SM_METADATA_MAX_BLOCKS; | ||
665 | r = sm_ll_extend(&smm->ll, nr_blocks); | 756 | r = sm_ll_extend(&smm->ll, nr_blocks); |
666 | if (r) | 757 | if (r) |
667 | return r; | 758 | return r; |
@@ -695,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, | |||
695 | smm->begin = 0; | 786 | smm->begin = 0; |
696 | smm->recursion_count = 0; | 787 | smm->recursion_count = 0; |
697 | smm->allocated_this_transaction = 0; | 788 | smm->allocated_this_transaction = 0; |
698 | smm->nr_uncommitted = 0; | 789 | brb_init(&smm->uncommitted); |
699 | threshold_init(&smm->threshold); | 790 | threshold_init(&smm->threshold); |
700 | 791 | ||
701 | memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); | 792 | memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); |
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h index 39bba0801cf2..64df923974d8 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.h +++ b/drivers/md/persistent-data/dm-space-map-metadata.h | |||
@@ -9,6 +9,17 @@ | |||
9 | 9 | ||
10 | #include "dm-transaction-manager.h" | 10 | #include "dm-transaction-manager.h" |
11 | 11 | ||
12 | #define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) | ||
13 | |||
14 | /* | ||
15 | * The metadata device is currently limited in size. | ||
16 | * | ||
17 | * We have one block of index, which can hold 255 index entries. Each | ||
18 | * index entry contains allocation info about ~16k metadata blocks. | ||
19 | */ | ||
20 | #define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) | ||
21 | #define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) | ||
22 | |||
12 | /* | 23 | /* |
13 | * Unfortunately we have to use two-phase construction due to the cycle | 24 | * Unfortunately we have to use two-phase construction due to the cycle |
14 | * between the tm and sm. | 25 | * between the tm and sm. |
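
The new limits are worth translating into sizes: DM_SM_METADATA_BLOCK_SIZE is 4096 >> 9 = 8 sectors (4KiB blocks), and 255 index entries covering roughly 16k blocks each cap the metadata device at 255 * (16384 - 64) = 4161600 blocks, about 15.9GiB. A sketch of the arithmetic (the EXAMPLE_* names are not part of the header):

#include "dm-space-map-metadata.h"

/* 4161600 blocks * 8 sectors per block = 33292800 sectors. */
#define EXAMPLE_MAX_SECTORS \
	((u64)DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)

/* 33292800 sectors * 512 bytes per sector ~= 15.9 GiB. */
#define EXAMPLE_MAX_BYTES	(EXAMPLE_MAX_SECTORS << SECTOR_SHIFT)
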
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c4d420b7d2f4..407a99e46f69 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -501,10 +501,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, | |||
501 | unsigned int chunk_sects, struct bio *bio) | 501 | unsigned int chunk_sects, struct bio *bio) |
502 | { | 502 | { |
503 | if (likely(is_power_of_2(chunk_sects))) { | 503 | if (likely(is_power_of_2(chunk_sects))) { |
504 | return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) | 504 | return chunk_sects >= |
505 | ((bio->bi_iter.bi_sector & (chunk_sects-1)) | ||
505 | + bio_sectors(bio)); | 506 | + bio_sectors(bio)); |
506 | } else{ | 507 | } else{ |
507 | sector_t sector = bio->bi_sector; | 508 | sector_t sector = bio->bi_iter.bi_sector; |
508 | return chunk_sects >= (sector_div(sector, chunk_sects) | 509 | return chunk_sects >= (sector_div(sector, chunk_sects) |
509 | + bio_sectors(bio)); | 510 | + bio_sectors(bio)); |
510 | } | 511 | } |
@@ -512,64 +513,44 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, | |||
512 | 513 | ||
513 | static void raid0_make_request(struct mddev *mddev, struct bio *bio) | 514 | static void raid0_make_request(struct mddev *mddev, struct bio *bio) |
514 | { | 515 | { |
515 | unsigned int chunk_sects; | ||
516 | sector_t sector_offset; | ||
517 | struct strip_zone *zone; | 516 | struct strip_zone *zone; |
518 | struct md_rdev *tmp_dev; | 517 | struct md_rdev *tmp_dev; |
518 | struct bio *split; | ||
519 | 519 | ||
520 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 520 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
521 | md_flush_request(mddev, bio); | 521 | md_flush_request(mddev, bio); |
522 | return; | 522 | return; |
523 | } | 523 | } |
524 | 524 | ||
525 | chunk_sects = mddev->chunk_sectors; | 525 | do { |
526 | if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { | 526 | sector_t sector = bio->bi_iter.bi_sector; |
527 | sector_t sector = bio->bi_sector; | 527 | unsigned chunk_sects = mddev->chunk_sectors; |
528 | struct bio_pair *bp; | ||
529 | /* Sanity check -- queue functions should prevent this happening */ | ||
530 | if (bio_segments(bio) > 1) | ||
531 | goto bad_map; | ||
532 | /* This is a one page bio that upper layers | ||
533 | * refuse to split for us, so we need to split it. | ||
534 | */ | ||
535 | if (likely(is_power_of_2(chunk_sects))) | ||
536 | bp = bio_split(bio, chunk_sects - (sector & | ||
537 | (chunk_sects-1))); | ||
538 | else | ||
539 | bp = bio_split(bio, chunk_sects - | ||
540 | sector_div(sector, chunk_sects)); | ||
541 | raid0_make_request(mddev, &bp->bio1); | ||
542 | raid0_make_request(mddev, &bp->bio2); | ||
543 | bio_pair_release(bp); | ||
544 | return; | ||
545 | } | ||
546 | 528 | ||
547 | sector_offset = bio->bi_sector; | 529 | unsigned sectors = chunk_sects - |
548 | zone = find_zone(mddev->private, §or_offset); | 530 | (likely(is_power_of_2(chunk_sects)) |
549 | tmp_dev = map_sector(mddev, zone, bio->bi_sector, | 531 | ? (sector & (chunk_sects-1)) |
550 | §or_offset); | 532 | : sector_div(sector, chunk_sects)); |
551 | bio->bi_bdev = tmp_dev->bdev; | ||
552 | bio->bi_sector = sector_offset + zone->dev_start + | ||
553 | tmp_dev->data_offset; | ||
554 | |||
555 | if (unlikely((bio->bi_rw & REQ_DISCARD) && | ||
556 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { | ||
557 | /* Just ignore it */ | ||
558 | bio_endio(bio, 0); | ||
559 | return; | ||
560 | } | ||
561 | 533 | ||
562 | generic_make_request(bio); | 534 | if (sectors < bio_sectors(bio)) { |
563 | return; | 535 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
564 | 536 | bio_chain(split, bio); | |
565 | bad_map: | 537 | } else { |
566 | printk("md/raid0:%s: make_request bug: can't convert block across chunks" | 538 | split = bio; |
567 | " or bigger than %dk %llu %d\n", | 539 | } |
568 | mdname(mddev), chunk_sects / 2, | ||
569 | (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); | ||
570 | 540 | ||
571 | bio_io_error(bio); | 541 | zone = find_zone(mddev->private, §or); |
572 | return; | 542 | tmp_dev = map_sector(mddev, zone, sector, §or); |
543 | split->bi_bdev = tmp_dev->bdev; | ||
544 | split->bi_iter.bi_sector = sector + zone->dev_start + | ||
545 | tmp_dev->data_offset; | ||
546 | |||
547 | if (unlikely((split->bi_rw & REQ_DISCARD) && | ||
548 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { | ||
549 | /* Just ignore it */ | ||
550 | bio_endio(split, 0); | ||
551 | } else | ||
552 | generic_make_request(split); | ||
553 | } while (split != bio); | ||
573 | } | 554 | } |
574 | 555 | ||
575 | static void raid0_status(struct seq_file *seq, struct mddev *mddev) | 556 | static void raid0_status(struct seq_file *seq, struct mddev *mddev) |
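
The only subtle step in the new raid0_make_request() loop is the per-iteration length: how many sectors remain before the current chunk ends. Pulled out as a sketch (not an actual raid0 helper):

#include <linux/blkdev.h>
#include <linux/log2.h>

static unsigned example_sectors_to_chunk_end(sector_t sector, unsigned chunk_sects)
{
	if (is_power_of_2(chunk_sects))
		return chunk_sects - (sector & (chunk_sects - 1));

	/* sector_div() divides in place and returns the remainder. */
	return chunk_sects - sector_div(sector, chunk_sects);
}
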
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1e5a540995e9..4a6ca1cb2e78 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -229,7 +229,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
229 | int done; | 229 | int done; |
230 | struct r1conf *conf = r1_bio->mddev->private; | 230 | struct r1conf *conf = r1_bio->mddev->private; |
231 | sector_t start_next_window = r1_bio->start_next_window; | 231 | sector_t start_next_window = r1_bio->start_next_window; |
232 | sector_t bi_sector = bio->bi_sector; | 232 | sector_t bi_sector = bio->bi_iter.bi_sector; |
233 | 233 | ||
234 | if (bio->bi_phys_segments) { | 234 | if (bio->bi_phys_segments) { |
235 | unsigned long flags; | 235 | unsigned long flags; |
@@ -265,9 +265,8 @@ static void raid_end_bio_io(struct r1bio *r1_bio) | |||
265 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | 265 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
266 | pr_debug("raid1: sync end %s on sectors %llu-%llu\n", | 266 | pr_debug("raid1: sync end %s on sectors %llu-%llu\n", |
267 | (bio_data_dir(bio) == WRITE) ? "write" : "read", | 267 | (bio_data_dir(bio) == WRITE) ? "write" : "read", |
268 | (unsigned long long) bio->bi_sector, | 268 | (unsigned long long) bio->bi_iter.bi_sector, |
269 | (unsigned long long) bio->bi_sector + | 269 | (unsigned long long) bio_end_sector(bio) - 1); |
270 | bio_sectors(bio) - 1); | ||
271 | 270 | ||
272 | call_bio_endio(r1_bio); | 271 | call_bio_endio(r1_bio); |
273 | } | 272 | } |
@@ -466,9 +465,8 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
466 | struct bio *mbio = r1_bio->master_bio; | 465 | struct bio *mbio = r1_bio->master_bio; |
467 | pr_debug("raid1: behind end write sectors" | 466 | pr_debug("raid1: behind end write sectors" |
468 | " %llu-%llu\n", | 467 | " %llu-%llu\n", |
469 | (unsigned long long) mbio->bi_sector, | 468 | (unsigned long long) mbio->bi_iter.bi_sector, |
470 | (unsigned long long) mbio->bi_sector + | 469 | (unsigned long long) bio_end_sector(mbio) - 1); |
471 | bio_sectors(mbio) - 1); | ||
472 | call_bio_endio(r1_bio); | 470 | call_bio_endio(r1_bio); |
473 | } | 471 | } |
474 | } | 472 | } |
@@ -875,7 +873,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | |||
875 | else if ((conf->next_resync - RESYNC_WINDOW_SECTORS | 873 | else if ((conf->next_resync - RESYNC_WINDOW_SECTORS |
876 | >= bio_end_sector(bio)) || | 874 | >= bio_end_sector(bio)) || |
877 | (conf->next_resync + NEXT_NORMALIO_DISTANCE | 875 | (conf->next_resync + NEXT_NORMALIO_DISTANCE |
878 | <= bio->bi_sector)) | 876 | <= bio->bi_iter.bi_sector)) |
879 | wait = false; | 877 | wait = false; |
880 | else | 878 | else |
881 | wait = true; | 879 | wait = true; |
@@ -913,20 +911,19 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | |||
913 | 911 | ||
914 | if (bio && bio_data_dir(bio) == WRITE) { | 912 | if (bio && bio_data_dir(bio) == WRITE) { |
915 | if (conf->next_resync + NEXT_NORMALIO_DISTANCE | 913 | if (conf->next_resync + NEXT_NORMALIO_DISTANCE |
916 | <= bio->bi_sector) { | 914 | <= bio->bi_iter.bi_sector) { |
917 | if (conf->start_next_window == MaxSector) | 915 | if (conf->start_next_window == MaxSector) |
918 | conf->start_next_window = | 916 | conf->start_next_window = |
919 | conf->next_resync + | 917 | conf->next_resync + |
920 | NEXT_NORMALIO_DISTANCE; | 918 | NEXT_NORMALIO_DISTANCE; |
921 | 919 | ||
922 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | 920 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) |
923 | <= bio->bi_sector) | 921 | <= bio->bi_iter.bi_sector) |
924 | conf->next_window_requests++; | 922 | conf->next_window_requests++; |
925 | else | 923 | else |
926 | conf->current_window_requests++; | 924 | conf->current_window_requests++; |
927 | } | ||
928 | if (bio->bi_sector >= conf->start_next_window) | ||
929 | sector = conf->start_next_window; | 925 | sector = conf->start_next_window; |
926 | } | ||
930 | } | 927 | } |
931 | 928 | ||
932 | conf->nr_pending++; | 929 | conf->nr_pending++; |
@@ -1028,7 +1025,8 @@ do_sync_io: | |||
1028 | if (bvecs[i].bv_page) | 1025 | if (bvecs[i].bv_page) |
1029 | put_page(bvecs[i].bv_page); | 1026 | put_page(bvecs[i].bv_page); |
1030 | kfree(bvecs); | 1027 | kfree(bvecs); |
1031 | pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 1028 | pr_debug("%dB behind alloc failed, doing sync I/O\n", |
1029 | bio->bi_iter.bi_size); | ||
1032 | } | 1030 | } |
1033 | 1031 | ||
1034 | struct raid1_plug_cb { | 1032 | struct raid1_plug_cb { |
@@ -1108,7 +1106,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1108 | 1106 | ||
1109 | if (bio_data_dir(bio) == WRITE && | 1107 | if (bio_data_dir(bio) == WRITE && |
1110 | bio_end_sector(bio) > mddev->suspend_lo && | 1108 | bio_end_sector(bio) > mddev->suspend_lo && |
1111 | bio->bi_sector < mddev->suspend_hi) { | 1109 | bio->bi_iter.bi_sector < mddev->suspend_hi) { |
1112 | /* As the suspend_* range is controlled by | 1110 | /* As the suspend_* range is controlled by |
1113 | * userspace, we want an interruptible | 1111 | * userspace, we want an interruptible |
1114 | * wait. | 1112 | * wait. |
@@ -1119,7 +1117,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1119 | prepare_to_wait(&conf->wait_barrier, | 1117 | prepare_to_wait(&conf->wait_barrier, |
1120 | &w, TASK_INTERRUPTIBLE); | 1118 | &w, TASK_INTERRUPTIBLE); |
1121 | if (bio_end_sector(bio) <= mddev->suspend_lo || | 1119 | if (bio_end_sector(bio) <= mddev->suspend_lo || |
1122 | bio->bi_sector >= mddev->suspend_hi) | 1120 | bio->bi_iter.bi_sector >= mddev->suspend_hi) |
1123 | break; | 1121 | break; |
1124 | schedule(); | 1122 | schedule(); |
1125 | } | 1123 | } |
@@ -1141,7 +1139,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1141 | r1_bio->sectors = bio_sectors(bio); | 1139 | r1_bio->sectors = bio_sectors(bio); |
1142 | r1_bio->state = 0; | 1140 | r1_bio->state = 0; |
1143 | r1_bio->mddev = mddev; | 1141 | r1_bio->mddev = mddev; |
1144 | r1_bio->sector = bio->bi_sector; | 1142 | r1_bio->sector = bio->bi_iter.bi_sector; |
1145 | 1143 | ||
1146 | /* We might need to issue multiple reads to different | 1144 | /* We might need to issue multiple reads to different |
1147 | * devices if there are bad blocks around, so we keep | 1145 | * devices if there are bad blocks around, so we keep |
@@ -1181,12 +1179,13 @@ read_again: | |||
1181 | r1_bio->read_disk = rdisk; | 1179 | r1_bio->read_disk = rdisk; |
1182 | 1180 | ||
1183 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1181 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1184 | bio_trim(read_bio, r1_bio->sector - bio->bi_sector, | 1182 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, |
1185 | max_sectors); | 1183 | max_sectors); |
1186 | 1184 | ||
1187 | r1_bio->bios[rdisk] = read_bio; | 1185 | r1_bio->bios[rdisk] = read_bio; |
1188 | 1186 | ||
1189 | read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; | 1187 | read_bio->bi_iter.bi_sector = r1_bio->sector + |
1188 | mirror->rdev->data_offset; | ||
1190 | read_bio->bi_bdev = mirror->rdev->bdev; | 1189 | read_bio->bi_bdev = mirror->rdev->bdev; |
1191 | read_bio->bi_end_io = raid1_end_read_request; | 1190 | read_bio->bi_end_io = raid1_end_read_request; |
1192 | read_bio->bi_rw = READ | do_sync; | 1191 | read_bio->bi_rw = READ | do_sync; |
@@ -1198,7 +1197,7 @@ read_again: | |||
1198 | */ | 1197 | */ |
1199 | 1198 | ||
1200 | sectors_handled = (r1_bio->sector + max_sectors | 1199 | sectors_handled = (r1_bio->sector + max_sectors |
1201 | - bio->bi_sector); | 1200 | - bio->bi_iter.bi_sector); |
1202 | r1_bio->sectors = max_sectors; | 1201 | r1_bio->sectors = max_sectors; |
1203 | spin_lock_irq(&conf->device_lock); | 1202 | spin_lock_irq(&conf->device_lock); |
1204 | if (bio->bi_phys_segments == 0) | 1203 | if (bio->bi_phys_segments == 0) |
@@ -1219,7 +1218,8 @@ read_again: | |||
1219 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | 1218 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; |
1220 | r1_bio->state = 0; | 1219 | r1_bio->state = 0; |
1221 | r1_bio->mddev = mddev; | 1220 | r1_bio->mddev = mddev; |
1222 | r1_bio->sector = bio->bi_sector + sectors_handled; | 1221 | r1_bio->sector = bio->bi_iter.bi_sector + |
1222 | sectors_handled; | ||
1223 | goto read_again; | 1223 | goto read_again; |
1224 | } else | 1224 | } else |
1225 | generic_make_request(read_bio); | 1225 | generic_make_request(read_bio); |
@@ -1322,7 +1322,7 @@ read_again: | |||
1322 | if (r1_bio->bios[j]) | 1322 | if (r1_bio->bios[j]) |
1323 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1323 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
1324 | r1_bio->state = 0; | 1324 | r1_bio->state = 0; |
1325 | allow_barrier(conf, start_next_window, bio->bi_sector); | 1325 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); |
1326 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1326 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
1327 | start_next_window = wait_barrier(conf, bio); | 1327 | start_next_window = wait_barrier(conf, bio); |
1328 | /* | 1328 | /* |
@@ -1349,7 +1349,7 @@ read_again: | |||
1349 | bio->bi_phys_segments++; | 1349 | bio->bi_phys_segments++; |
1350 | spin_unlock_irq(&conf->device_lock); | 1350 | spin_unlock_irq(&conf->device_lock); |
1351 | } | 1351 | } |
1352 | sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; | 1352 | sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; |
1353 | 1353 | ||
1354 | atomic_set(&r1_bio->remaining, 1); | 1354 | atomic_set(&r1_bio->remaining, 1); |
1355 | atomic_set(&r1_bio->behind_remaining, 0); | 1355 | atomic_set(&r1_bio->behind_remaining, 0); |
@@ -1361,7 +1361,7 @@ read_again: | |||
1361 | continue; | 1361 | continue; |
1362 | 1362 | ||
1363 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1363 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1364 | bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors); | 1364 | bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); |
1365 | 1365 | ||
1366 | if (first_clone) { | 1366 | if (first_clone) { |
1367 | /* do behind I/O ? | 1367 | /* do behind I/O ? |
@@ -1395,7 +1395,7 @@ read_again: | |||
1395 | 1395 | ||
1396 | r1_bio->bios[i] = mbio; | 1396 | r1_bio->bios[i] = mbio; |
1397 | 1397 | ||
1398 | mbio->bi_sector = (r1_bio->sector + | 1398 | mbio->bi_iter.bi_sector = (r1_bio->sector + |
1399 | conf->mirrors[i].rdev->data_offset); | 1399 | conf->mirrors[i].rdev->data_offset); |
1400 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1400 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1401 | mbio->bi_end_io = raid1_end_write_request; | 1401 | mbio->bi_end_io = raid1_end_write_request; |
@@ -1435,7 +1435,7 @@ read_again: | |||
1435 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | 1435 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; |
1436 | r1_bio->state = 0; | 1436 | r1_bio->state = 0; |
1437 | r1_bio->mddev = mddev; | 1437 | r1_bio->mddev = mddev; |
1438 | r1_bio->sector = bio->bi_sector + sectors_handled; | 1438 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; |
1439 | goto retry_write; | 1439 | goto retry_write; |
1440 | } | 1440 | } |
1441 | 1441 | ||
@@ -1953,20 +1953,24 @@ static int process_checks(struct r1bio *r1_bio) | |||
1953 | for (i = 0; i < conf->raid_disks * 2; i++) { | 1953 | for (i = 0; i < conf->raid_disks * 2; i++) { |
1954 | int j; | 1954 | int j; |
1955 | int size; | 1955 | int size; |
1956 | int uptodate; | ||
1956 | struct bio *b = r1_bio->bios[i]; | 1957 | struct bio *b = r1_bio->bios[i]; |
1957 | if (b->bi_end_io != end_sync_read) | 1958 | if (b->bi_end_io != end_sync_read) |
1958 | continue; | 1959 | continue; |
1959 | /* fixup the bio for reuse */ | 1960 | /* fixup the bio for reuse, but preserve BIO_UPTODATE */ |
1961 | uptodate = test_bit(BIO_UPTODATE, &b->bi_flags); | ||
1960 | bio_reset(b); | 1962 | bio_reset(b); |
1963 | if (!uptodate) | ||
1964 | clear_bit(BIO_UPTODATE, &b->bi_flags); | ||
1961 | b->bi_vcnt = vcnt; | 1965 | b->bi_vcnt = vcnt; |
1962 | b->bi_size = r1_bio->sectors << 9; | 1966 | b->bi_iter.bi_size = r1_bio->sectors << 9; |
1963 | b->bi_sector = r1_bio->sector + | 1967 | b->bi_iter.bi_sector = r1_bio->sector + |
1964 | conf->mirrors[i].rdev->data_offset; | 1968 | conf->mirrors[i].rdev->data_offset; |
1965 | b->bi_bdev = conf->mirrors[i].rdev->bdev; | 1969 | b->bi_bdev = conf->mirrors[i].rdev->bdev; |
1966 | b->bi_end_io = end_sync_read; | 1970 | b->bi_end_io = end_sync_read; |
1967 | b->bi_private = r1_bio; | 1971 | b->bi_private = r1_bio; |
1968 | 1972 | ||
1969 | size = b->bi_size; | 1973 | size = b->bi_iter.bi_size; |
1970 | for (j = 0; j < vcnt ; j++) { | 1974 | for (j = 0; j < vcnt ; j++) { |
1971 | struct bio_vec *bi; | 1975 | struct bio_vec *bi; |
1972 | bi = &b->bi_io_vec[j]; | 1976 | bi = &b->bi_io_vec[j]; |
@@ -1990,11 +1994,14 @@ static int process_checks(struct r1bio *r1_bio) | |||
1990 | int j; | 1994 | int j; |
1991 | struct bio *pbio = r1_bio->bios[primary]; | 1995 | struct bio *pbio = r1_bio->bios[primary]; |
1992 | struct bio *sbio = r1_bio->bios[i]; | 1996 | struct bio *sbio = r1_bio->bios[i]; |
1997 | int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags); | ||
1993 | 1998 | ||
1994 | if (sbio->bi_end_io != end_sync_read) | 1999 | if (sbio->bi_end_io != end_sync_read) |
1995 | continue; | 2000 | continue; |
2001 | /* Now we can 'fixup' the BIO_UPTODATE flag */ | ||
2002 | set_bit(BIO_UPTODATE, &sbio->bi_flags); | ||
1996 | 2003 | ||
1997 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { | 2004 | if (uptodate) { |
1998 | for (j = vcnt; j-- ; ) { | 2005 | for (j = vcnt; j-- ; ) { |
1999 | struct page *p, *s; | 2006 | struct page *p, *s; |
2000 | p = pbio->bi_io_vec[j].bv_page; | 2007 | p = pbio->bi_io_vec[j].bv_page; |
@@ -2009,7 +2016,7 @@ static int process_checks(struct r1bio *r1_bio) | |||
2009 | if (j >= 0) | 2016 | if (j >= 0) |
2010 | atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); | 2017 | atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); |
2011 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | 2018 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) |
2012 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | 2019 | && uptodate)) { |
2013 | /* No need to write to this device. */ | 2020 | /* No need to write to this device. */ |
2014 | sbio->bi_end_io = NULL; | 2021 | sbio->bi_end_io = NULL; |
2015 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | 2022 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); |
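The process_checks() hunks above fix a subtle loss of state: bio_reset() re-initialises the bio (leaving BIO_UPTODATE set), so the read result has to be captured before the reset and re-applied afterwards, and the later checks now test the saved value instead of the live flag. A minimal sketch of that save-and-restore pattern follows; the helper name is made up for illustration and is not part of this patch.

    #include <linux/bio.h>

    /* Capture BIO_UPTODATE, reset the bio for reuse, then restore the
     * captured result so later checks still see the real read status. */
    static void reset_bio_keep_uptodate(struct bio *b)
    {
            int uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);

            bio_reset(b);                   /* wipes end_io, flags, iter, ... */
            if (!uptodate)
                    clear_bit(BIO_UPTODATE, &b->bi_flags);
    }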
@@ -2221,11 +2228,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) | |||
2221 | } | 2228 | } |
2222 | 2229 | ||
2223 | wbio->bi_rw = WRITE; | 2230 | wbio->bi_rw = WRITE; |
2224 | wbio->bi_sector = r1_bio->sector; | 2231 | wbio->bi_iter.bi_sector = r1_bio->sector; |
2225 | wbio->bi_size = r1_bio->sectors << 9; | 2232 | wbio->bi_iter.bi_size = r1_bio->sectors << 9; |
2226 | 2233 | ||
2227 | bio_trim(wbio, sector - r1_bio->sector, sectors); | 2234 | bio_trim(wbio, sector - r1_bio->sector, sectors); |
2228 | wbio->bi_sector += rdev->data_offset; | 2235 | wbio->bi_iter.bi_sector += rdev->data_offset; |
2229 | wbio->bi_bdev = rdev->bdev; | 2236 | wbio->bi_bdev = rdev->bdev; |
2230 | if (submit_bio_wait(WRITE, wbio) == 0) | 2237 | if (submit_bio_wait(WRITE, wbio) == 0) |
2231 | /* failure! */ | 2238 | /* failure! */ |
@@ -2339,7 +2346,8 @@ read_more: | |||
2339 | } | 2346 | } |
2340 | r1_bio->read_disk = disk; | 2347 | r1_bio->read_disk = disk; |
2341 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2348 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); |
2342 | bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors); | 2349 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, |
2350 | max_sectors); | ||
2343 | r1_bio->bios[r1_bio->read_disk] = bio; | 2351 | r1_bio->bios[r1_bio->read_disk] = bio; |
2344 | rdev = conf->mirrors[disk].rdev; | 2352 | rdev = conf->mirrors[disk].rdev; |
2345 | printk_ratelimited(KERN_ERR | 2353 | printk_ratelimited(KERN_ERR |
@@ -2348,7 +2356,7 @@ read_more: | |||
2348 | mdname(mddev), | 2356 | mdname(mddev), |
2349 | (unsigned long long)r1_bio->sector, | 2357 | (unsigned long long)r1_bio->sector, |
2350 | bdevname(rdev->bdev, b)); | 2358 | bdevname(rdev->bdev, b)); |
2351 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | 2359 | bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; |
2352 | bio->bi_bdev = rdev->bdev; | 2360 | bio->bi_bdev = rdev->bdev; |
2353 | bio->bi_end_io = raid1_end_read_request; | 2361 | bio->bi_end_io = raid1_end_read_request; |
2354 | bio->bi_rw = READ | do_sync; | 2362 | bio->bi_rw = READ | do_sync; |
@@ -2357,7 +2365,7 @@ read_more: | |||
2357 | /* Drat - have to split this up more */ | 2365 | /* Drat - have to split this up more */ |
2358 | struct bio *mbio = r1_bio->master_bio; | 2366 | struct bio *mbio = r1_bio->master_bio; |
2359 | int sectors_handled = (r1_bio->sector + max_sectors | 2367 | int sectors_handled = (r1_bio->sector + max_sectors |
2360 | - mbio->bi_sector); | 2368 | - mbio->bi_iter.bi_sector); |
2361 | r1_bio->sectors = max_sectors; | 2369 | r1_bio->sectors = max_sectors; |
2362 | spin_lock_irq(&conf->device_lock); | 2370 | spin_lock_irq(&conf->device_lock); |
2363 | if (mbio->bi_phys_segments == 0) | 2371 | if (mbio->bi_phys_segments == 0) |
@@ -2375,7 +2383,8 @@ read_more: | |||
2375 | r1_bio->state = 0; | 2383 | r1_bio->state = 0; |
2376 | set_bit(R1BIO_ReadError, &r1_bio->state); | 2384 | set_bit(R1BIO_ReadError, &r1_bio->state); |
2377 | r1_bio->mddev = mddev; | 2385 | r1_bio->mddev = mddev; |
2378 | r1_bio->sector = mbio->bi_sector + sectors_handled; | 2386 | r1_bio->sector = mbio->bi_iter.bi_sector + |
2387 | sectors_handled; | ||
2379 | 2388 | ||
2380 | goto read_more; | 2389 | goto read_more; |
2381 | } else | 2390 | } else |
@@ -2599,7 +2608,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2599 | } | 2608 | } |
2600 | if (bio->bi_end_io) { | 2609 | if (bio->bi_end_io) { |
2601 | atomic_inc(&rdev->nr_pending); | 2610 | atomic_inc(&rdev->nr_pending); |
2602 | bio->bi_sector = sector_nr + rdev->data_offset; | 2611 | bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; |
2603 | bio->bi_bdev = rdev->bdev; | 2612 | bio->bi_bdev = rdev->bdev; |
2604 | bio->bi_private = r1_bio; | 2613 | bio->bi_private = r1_bio; |
2605 | } | 2614 | } |
@@ -2699,7 +2708,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2699 | continue; | 2708 | continue; |
2700 | /* remove last page from this bio */ | 2709 | /* remove last page from this bio */ |
2701 | bio->bi_vcnt--; | 2710 | bio->bi_vcnt--; |
2702 | bio->bi_size -= len; | 2711 | bio->bi_iter.bi_size -= len; |
2703 | bio->bi_flags &= ~(1<< BIO_SEG_VALID); | 2712 | bio->bi_flags &= ~(1<< BIO_SEG_VALID); |
2704 | } | 2713 | } |
2705 | goto bio_full; | 2714 | goto bio_full; |
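Most of the raid1.c churn above is the mechanical half of the immutable-biovec conversion: a bio's position and remaining size moved from fields on struct bio itself (bi_sector, bi_size) into the embedded iterator at bio->bi_iter. The sketch below is only an orientation aid, assuming the 3.14-era field names; the helper functions are hypothetical and not taken from this patch.

    #include <linux/bio.h>

    /* Before the conversion the same values were read as bio->bi_sector
     * and bio->bi_size; afterwards they live in the per-bio iterator. */
    static sector_t bio_start_sector(struct bio *bio)
    {
            return bio->bi_iter.bi_sector;
    }

    static unsigned int bio_bytes_left(struct bio *bio)
    {
            return bio->bi_iter.bi_size;
    }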
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c504e8389e69..33fc408e5eac 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -1152,14 +1152,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
1152 | kfree(plug); | 1152 | kfree(plug); |
1153 | } | 1153 | } |
1154 | 1154 | ||
1155 | static void make_request(struct mddev *mddev, struct bio * bio) | 1155 | static void __make_request(struct mddev *mddev, struct bio *bio) |
1156 | { | 1156 | { |
1157 | struct r10conf *conf = mddev->private; | 1157 | struct r10conf *conf = mddev->private; |
1158 | struct r10bio *r10_bio; | 1158 | struct r10bio *r10_bio; |
1159 | struct bio *read_bio; | 1159 | struct bio *read_bio; |
1160 | int i; | 1160 | int i; |
1161 | sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); | ||
1162 | int chunk_sects = chunk_mask + 1; | ||
1163 | const int rw = bio_data_dir(bio); | 1161 | const int rw = bio_data_dir(bio); |
1164 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 1162 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
1165 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1163 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
@@ -1174,88 +1172,27 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1174 | int max_sectors; | 1172 | int max_sectors; |
1175 | int sectors; | 1173 | int sectors; |
1176 | 1174 | ||
1177 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | ||
1178 | md_flush_request(mddev, bio); | ||
1179 | return; | ||
1180 | } | ||
1181 | |||
1182 | /* If this request crosses a chunk boundary, we need to | ||
1183 | * split it. This will only happen for 1 PAGE (or less) requests. | ||
1184 | */ | ||
1185 | if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio) | ||
1186 | > chunk_sects | ||
1187 | && (conf->geo.near_copies < conf->geo.raid_disks | ||
1188 | || conf->prev.near_copies < conf->prev.raid_disks))) { | ||
1189 | struct bio_pair *bp; | ||
1190 | /* Sanity check -- queue functions should prevent this happening */ | ||
1191 | if (bio_segments(bio) > 1) | ||
1192 | goto bad_map; | ||
1193 | /* This is a one page bio that upper layers | ||
1194 | * refuse to split for us, so we need to split it. | ||
1195 | */ | ||
1196 | bp = bio_split(bio, | ||
1197 | chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); | ||
1198 | |||
1199 | /* Each of these 'make_request' calls will call 'wait_barrier'. | ||
1200 | * If the first succeeds but the second blocks due to the resync | ||
1201 | * thread raising the barrier, we will deadlock because the | ||
1202 | * IO to the underlying device will be queued in generic_make_request | ||
1203 | * and will never complete, so will never reduce nr_pending. | ||
1204 | * So increment nr_waiting here so no new raise_barriers will | ||
1205 | * succeed, and so the second wait_barrier cannot block. | ||
1206 | */ | ||
1207 | spin_lock_irq(&conf->resync_lock); | ||
1208 | conf->nr_waiting++; | ||
1209 | spin_unlock_irq(&conf->resync_lock); | ||
1210 | |||
1211 | make_request(mddev, &bp->bio1); | ||
1212 | make_request(mddev, &bp->bio2); | ||
1213 | |||
1214 | spin_lock_irq(&conf->resync_lock); | ||
1215 | conf->nr_waiting--; | ||
1216 | wake_up(&conf->wait_barrier); | ||
1217 | spin_unlock_irq(&conf->resync_lock); | ||
1218 | |||
1219 | bio_pair_release(bp); | ||
1220 | return; | ||
1221 | bad_map: | ||
1222 | printk("md/raid10:%s: make_request bug: can't convert block across chunks" | ||
1223 | " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, | ||
1224 | (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); | ||
1225 | |||
1226 | bio_io_error(bio); | ||
1227 | return; | ||
1228 | } | ||
1229 | |||
1230 | md_write_start(mddev, bio); | ||
1231 | |||
1232 | /* | ||
1233 | * Register the new request and wait if the reconstruction | ||
1234 | * thread has put up a bar for new requests. | ||
1235 | * Continue immediately if no resync is active currently. | ||
1236 | */ | ||
1237 | wait_barrier(conf); | ||
1238 | |||
1239 | sectors = bio_sectors(bio); | 1175 | sectors = bio_sectors(bio); |
1240 | while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | 1176 | while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
1241 | bio->bi_sector < conf->reshape_progress && | 1177 | bio->bi_iter.bi_sector < conf->reshape_progress && |
1242 | bio->bi_sector + sectors > conf->reshape_progress) { | 1178 | bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { |
1243 | /* IO spans the reshape position. Need to wait for | 1179 | /* IO spans the reshape position. Need to wait for |
1244 | * reshape to pass | 1180 | * reshape to pass |
1245 | */ | 1181 | */ |
1246 | allow_barrier(conf); | 1182 | allow_barrier(conf); |
1247 | wait_event(conf->wait_barrier, | 1183 | wait_event(conf->wait_barrier, |
1248 | conf->reshape_progress <= bio->bi_sector || | 1184 | conf->reshape_progress <= bio->bi_iter.bi_sector || |
1249 | conf->reshape_progress >= bio->bi_sector + sectors); | 1185 | conf->reshape_progress >= bio->bi_iter.bi_sector + |
1186 | sectors); | ||
1250 | wait_barrier(conf); | 1187 | wait_barrier(conf); |
1251 | } | 1188 | } |
1252 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | 1189 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
1253 | bio_data_dir(bio) == WRITE && | 1190 | bio_data_dir(bio) == WRITE && |
1254 | (mddev->reshape_backwards | 1191 | (mddev->reshape_backwards |
1255 | ? (bio->bi_sector < conf->reshape_safe && | 1192 | ? (bio->bi_iter.bi_sector < conf->reshape_safe && |
1256 | bio->bi_sector + sectors > conf->reshape_progress) | 1193 | bio->bi_iter.bi_sector + sectors > conf->reshape_progress) |
1257 | : (bio->bi_sector + sectors > conf->reshape_safe && | 1194 | : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && |
1258 | bio->bi_sector < conf->reshape_progress))) { | 1195 | bio->bi_iter.bi_sector < conf->reshape_progress))) { |
1259 | /* Need to update reshape_position in metadata */ | 1196 | /* Need to update reshape_position in metadata */ |
1260 | mddev->reshape_position = conf->reshape_progress; | 1197 | mddev->reshape_position = conf->reshape_progress; |
1261 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1198 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
@@ -1273,7 +1210,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1273 | r10_bio->sectors = sectors; | 1210 | r10_bio->sectors = sectors; |
1274 | 1211 | ||
1275 | r10_bio->mddev = mddev; | 1212 | r10_bio->mddev = mddev; |
1276 | r10_bio->sector = bio->bi_sector; | 1213 | r10_bio->sector = bio->bi_iter.bi_sector; |
1277 | r10_bio->state = 0; | 1214 | r10_bio->state = 0; |
1278 | 1215 | ||
1279 | /* We might need to issue multiple reads to different | 1216 | /* We might need to issue multiple reads to different |
@@ -1302,13 +1239,13 @@ read_again: | |||
1302 | slot = r10_bio->read_slot; | 1239 | slot = r10_bio->read_slot; |
1303 | 1240 | ||
1304 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1241 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1305 | bio_trim(read_bio, r10_bio->sector - bio->bi_sector, | 1242 | bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, |
1306 | max_sectors); | 1243 | max_sectors); |
1307 | 1244 | ||
1308 | r10_bio->devs[slot].bio = read_bio; | 1245 | r10_bio->devs[slot].bio = read_bio; |
1309 | r10_bio->devs[slot].rdev = rdev; | 1246 | r10_bio->devs[slot].rdev = rdev; |
1310 | 1247 | ||
1311 | read_bio->bi_sector = r10_bio->devs[slot].addr + | 1248 | read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + |
1312 | choose_data_offset(r10_bio, rdev); | 1249 | choose_data_offset(r10_bio, rdev); |
1313 | read_bio->bi_bdev = rdev->bdev; | 1250 | read_bio->bi_bdev = rdev->bdev; |
1314 | read_bio->bi_end_io = raid10_end_read_request; | 1251 | read_bio->bi_end_io = raid10_end_read_request; |
@@ -1319,15 +1256,15 @@ read_again: | |||
1319 | /* Could not read all from this device, so we will | 1256 | /* Could not read all from this device, so we will |
1320 | * need another r10_bio. | 1257 | * need another r10_bio. |
1321 | */ | 1258 | */ |
1322 | sectors_handled = (r10_bio->sectors + max_sectors | 1259 | sectors_handled = (r10_bio->sector + max_sectors |
1323 | - bio->bi_sector); | 1260 | - bio->bi_iter.bi_sector); |
1324 | r10_bio->sectors = max_sectors; | 1261 | r10_bio->sectors = max_sectors; |
1325 | spin_lock_irq(&conf->device_lock); | 1262 | spin_lock_irq(&conf->device_lock); |
1326 | if (bio->bi_phys_segments == 0) | 1263 | if (bio->bi_phys_segments == 0) |
1327 | bio->bi_phys_segments = 2; | 1264 | bio->bi_phys_segments = 2; |
1328 | else | 1265 | else |
1329 | bio->bi_phys_segments++; | 1266 | bio->bi_phys_segments++; |
1330 | spin_unlock(&conf->device_lock); | 1267 | spin_unlock_irq(&conf->device_lock); |
1331 | /* Cannot call generic_make_request directly | 1268 | /* Cannot call generic_make_request directly |
1332 | * as that will be queued in __generic_make_request | 1269 | * as that will be queued in __generic_make_request |
1333 | * and subsequent mempool_alloc might block | 1270 | * and subsequent mempool_alloc might block |
@@ -1341,7 +1278,8 @@ read_again: | |||
1341 | r10_bio->sectors = bio_sectors(bio) - sectors_handled; | 1278 | r10_bio->sectors = bio_sectors(bio) - sectors_handled; |
1342 | r10_bio->state = 0; | 1279 | r10_bio->state = 0; |
1343 | r10_bio->mddev = mddev; | 1280 | r10_bio->mddev = mddev; |
1344 | r10_bio->sector = bio->bi_sector + sectors_handled; | 1281 | r10_bio->sector = bio->bi_iter.bi_sector + |
1282 | sectors_handled; | ||
1345 | goto read_again; | 1283 | goto read_again; |
1346 | } else | 1284 | } else |
1347 | generic_make_request(read_bio); | 1285 | generic_make_request(read_bio); |
@@ -1499,7 +1437,8 @@ retry_write: | |||
1499 | bio->bi_phys_segments++; | 1437 | bio->bi_phys_segments++; |
1500 | spin_unlock_irq(&conf->device_lock); | 1438 | spin_unlock_irq(&conf->device_lock); |
1501 | } | 1439 | } |
1502 | sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; | 1440 | sectors_handled = r10_bio->sector + max_sectors - |
1441 | bio->bi_iter.bi_sector; | ||
1503 | 1442 | ||
1504 | atomic_set(&r10_bio->remaining, 1); | 1443 | atomic_set(&r10_bio->remaining, 1); |
1505 | bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); | 1444 | bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); |
@@ -1510,11 +1449,11 @@ retry_write: | |||
1510 | if (r10_bio->devs[i].bio) { | 1449 | if (r10_bio->devs[i].bio) { |
1511 | struct md_rdev *rdev = conf->mirrors[d].rdev; | 1450 | struct md_rdev *rdev = conf->mirrors[d].rdev; |
1512 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1451 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1513 | bio_trim(mbio, r10_bio->sector - bio->bi_sector, | 1452 | bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, |
1514 | max_sectors); | 1453 | max_sectors); |
1515 | r10_bio->devs[i].bio = mbio; | 1454 | r10_bio->devs[i].bio = mbio; |
1516 | 1455 | ||
1517 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1456 | mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ |
1518 | choose_data_offset(r10_bio, | 1457 | choose_data_offset(r10_bio, |
1519 | rdev)); | 1458 | rdev)); |
1520 | mbio->bi_bdev = rdev->bdev; | 1459 | mbio->bi_bdev = rdev->bdev; |
@@ -1553,11 +1492,11 @@ retry_write: | |||
1553 | rdev = conf->mirrors[d].rdev; | 1492 | rdev = conf->mirrors[d].rdev; |
1554 | } | 1493 | } |
1555 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1494 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1556 | bio_trim(mbio, r10_bio->sector - bio->bi_sector, | 1495 | bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, |
1557 | max_sectors); | 1496 | max_sectors); |
1558 | r10_bio->devs[i].repl_bio = mbio; | 1497 | r10_bio->devs[i].repl_bio = mbio; |
1559 | 1498 | ||
1560 | mbio->bi_sector = (r10_bio->devs[i].addr + | 1499 | mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + |
1561 | choose_data_offset( | 1500 | choose_data_offset( |
1562 | r10_bio, rdev)); | 1501 | r10_bio, rdev)); |
1563 | mbio->bi_bdev = rdev->bdev; | 1502 | mbio->bi_bdev = rdev->bdev; |
@@ -1591,11 +1530,57 @@ retry_write: | |||
1591 | r10_bio->sectors = bio_sectors(bio) - sectors_handled; | 1530 | r10_bio->sectors = bio_sectors(bio) - sectors_handled; |
1592 | 1531 | ||
1593 | r10_bio->mddev = mddev; | 1532 | r10_bio->mddev = mddev; |
1594 | r10_bio->sector = bio->bi_sector + sectors_handled; | 1533 | r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; |
1595 | r10_bio->state = 0; | 1534 | r10_bio->state = 0; |
1596 | goto retry_write; | 1535 | goto retry_write; |
1597 | } | 1536 | } |
1598 | one_write_done(r10_bio); | 1537 | one_write_done(r10_bio); |
1538 | } | ||
1539 | |||
1540 | static void make_request(struct mddev *mddev, struct bio *bio) | ||
1541 | { | ||
1542 | struct r10conf *conf = mddev->private; | ||
1543 | sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); | ||
1544 | int chunk_sects = chunk_mask + 1; | ||
1545 | |||
1546 | struct bio *split; | ||
1547 | |||
1548 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | ||
1549 | md_flush_request(mddev, bio); | ||
1550 | return; | ||
1551 | } | ||
1552 | |||
1553 | md_write_start(mddev, bio); | ||
1554 | |||
1555 | /* | ||
1556 | * Register the new request and wait if the reconstruction | ||
1557 | * thread has put up a bar for new requests. | ||
1558 | * Continue immediately if no resync is active currently. | ||
1559 | */ | ||
1560 | wait_barrier(conf); | ||
1561 | |||
1562 | do { | ||
1563 | |||
1564 | /* | ||
1565 | * If this request crosses a chunk boundary, we need to split | ||
1566 | * it. | ||
1567 | */ | ||
1568 | if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + | ||
1569 | bio_sectors(bio) > chunk_sects | ||
1570 | && (conf->geo.near_copies < conf->geo.raid_disks | ||
1571 | || conf->prev.near_copies < | ||
1572 | conf->prev.raid_disks))) { | ||
1573 | split = bio_split(bio, chunk_sects - | ||
1574 | (bio->bi_iter.bi_sector & | ||
1575 | (chunk_sects - 1)), | ||
1576 | GFP_NOIO, fs_bio_set); | ||
1577 | bio_chain(split, bio); | ||
1578 | } else { | ||
1579 | split = bio; | ||
1580 | } | ||
1581 | |||
1582 | __make_request(mddev, split); | ||
1583 | } while (split != bio); | ||
1599 | 1584 | ||
1600 | /* In case raid10d snuck in to freeze_array */ | 1585 | /* In case raid10d snuck in to freeze_array */ |
1601 | wake_up(&conf->wait_barrier); | 1586 | wake_up(&conf->wait_barrier); |
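The make_request() rewrite above drops the old one-page bio_pair split, with its recursive make_request() calls and the hand-rolled nr_waiting bump that avoided a barrier deadlock, and instead peels chunk-sized pieces off the front of the bio with bio_split(), tying each piece back to the parent with bio_chain() so the parent only completes once every piece has. The sketch below isolates that loop shape; submit_piece() is a hypothetical stand-in for __make_request(), and the extra near_copies condition from the real code is omitted.

    #include <linux/bio.h>

    /* Assumes chunk_sects is a power of two, as in raid10. */
    static void split_by_chunk(struct bio *bio, unsigned int chunk_sects,
                               void (*submit_piece)(struct bio *))
    {
            struct bio *split;

            do {
                    unsigned int sectors = chunk_sects -
                            (bio->bi_iter.bi_sector & (chunk_sects - 1));

                    if (bio_sectors(bio) > sectors) {
                            /* carve off the part that fits in this chunk */
                            split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
                            bio_chain(split, bio);  /* parent waits for child */
                    } else {
                            split = bio;            /* last (or only) piece */
                    }
                    submit_piece(split);
            } while (split != bio);
    }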
@@ -2124,10 +2109,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
2124 | bio_reset(tbio); | 2109 | bio_reset(tbio); |
2125 | 2110 | ||
2126 | tbio->bi_vcnt = vcnt; | 2111 | tbio->bi_vcnt = vcnt; |
2127 | tbio->bi_size = r10_bio->sectors << 9; | 2112 | tbio->bi_iter.bi_size = r10_bio->sectors << 9; |
2128 | tbio->bi_rw = WRITE; | 2113 | tbio->bi_rw = WRITE; |
2129 | tbio->bi_private = r10_bio; | 2114 | tbio->bi_private = r10_bio; |
2130 | tbio->bi_sector = r10_bio->devs[i].addr; | 2115 | tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; |
2131 | 2116 | ||
2132 | for (j=0; j < vcnt ; j++) { | 2117 | for (j=0; j < vcnt ; j++) { |
2133 | tbio->bi_io_vec[j].bv_offset = 0; | 2118 | tbio->bi_io_vec[j].bv_offset = 0; |
@@ -2144,7 +2129,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
2144 | atomic_inc(&r10_bio->remaining); | 2129 | atomic_inc(&r10_bio->remaining); |
2145 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); | 2130 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); |
2146 | 2131 | ||
2147 | tbio->bi_sector += conf->mirrors[d].rdev->data_offset; | 2132 | tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; |
2148 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2133 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
2149 | generic_make_request(tbio); | 2134 | generic_make_request(tbio); |
2150 | } | 2135 | } |
@@ -2614,8 +2599,8 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) | |||
2614 | sectors = sect_to_write; | 2599 | sectors = sect_to_write; |
2615 | /* Write at 'sector' for 'sectors' */ | 2600 | /* Write at 'sector' for 'sectors' */ |
2616 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 2601 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
2617 | bio_trim(wbio, sector - bio->bi_sector, sectors); | 2602 | bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); |
2618 | wbio->bi_sector = (r10_bio->devs[i].addr+ | 2603 | wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ |
2619 | choose_data_offset(r10_bio, rdev) + | 2604 | choose_data_offset(r10_bio, rdev) + |
2620 | (sector - r10_bio->sector)); | 2605 | (sector - r10_bio->sector)); |
2621 | wbio->bi_bdev = rdev->bdev; | 2606 | wbio->bi_bdev = rdev->bdev; |
@@ -2687,10 +2672,10 @@ read_more: | |||
2687 | (unsigned long long)r10_bio->sector); | 2672 | (unsigned long long)r10_bio->sector); |
2688 | bio = bio_clone_mddev(r10_bio->master_bio, | 2673 | bio = bio_clone_mddev(r10_bio->master_bio, |
2689 | GFP_NOIO, mddev); | 2674 | GFP_NOIO, mddev); |
2690 | bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors); | 2675 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); |
2691 | r10_bio->devs[slot].bio = bio; | 2676 | r10_bio->devs[slot].bio = bio; |
2692 | r10_bio->devs[slot].rdev = rdev; | 2677 | r10_bio->devs[slot].rdev = rdev; |
2693 | bio->bi_sector = r10_bio->devs[slot].addr | 2678 | bio->bi_iter.bi_sector = r10_bio->devs[slot].addr |
2694 | + choose_data_offset(r10_bio, rdev); | 2679 | + choose_data_offset(r10_bio, rdev); |
2695 | bio->bi_bdev = rdev->bdev; | 2680 | bio->bi_bdev = rdev->bdev; |
2696 | bio->bi_rw = READ | do_sync; | 2681 | bio->bi_rw = READ | do_sync; |
@@ -2701,7 +2686,7 @@ read_more: | |||
2701 | struct bio *mbio = r10_bio->master_bio; | 2686 | struct bio *mbio = r10_bio->master_bio; |
2702 | int sectors_handled = | 2687 | int sectors_handled = |
2703 | r10_bio->sector + max_sectors | 2688 | r10_bio->sector + max_sectors |
2704 | - mbio->bi_sector; | 2689 | - mbio->bi_iter.bi_sector; |
2705 | r10_bio->sectors = max_sectors; | 2690 | r10_bio->sectors = max_sectors; |
2706 | spin_lock_irq(&conf->device_lock); | 2691 | spin_lock_irq(&conf->device_lock); |
2707 | if (mbio->bi_phys_segments == 0) | 2692 | if (mbio->bi_phys_segments == 0) |
@@ -2719,7 +2704,7 @@ read_more: | |||
2719 | set_bit(R10BIO_ReadError, | 2704 | set_bit(R10BIO_ReadError, |
2720 | &r10_bio->state); | 2705 | &r10_bio->state); |
2721 | r10_bio->mddev = mddev; | 2706 | r10_bio->mddev = mddev; |
2722 | r10_bio->sector = mbio->bi_sector | 2707 | r10_bio->sector = mbio->bi_iter.bi_sector |
2723 | + sectors_handled; | 2708 | + sectors_handled; |
2724 | 2709 | ||
2725 | goto read_more; | 2710 | goto read_more; |
@@ -3157,7 +3142,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3157 | bio->bi_end_io = end_sync_read; | 3142 | bio->bi_end_io = end_sync_read; |
3158 | bio->bi_rw = READ; | 3143 | bio->bi_rw = READ; |
3159 | from_addr = r10_bio->devs[j].addr; | 3144 | from_addr = r10_bio->devs[j].addr; |
3160 | bio->bi_sector = from_addr + rdev->data_offset; | 3145 | bio->bi_iter.bi_sector = from_addr + |
3146 | rdev->data_offset; | ||
3161 | bio->bi_bdev = rdev->bdev; | 3147 | bio->bi_bdev = rdev->bdev; |
3162 | atomic_inc(&rdev->nr_pending); | 3148 | atomic_inc(&rdev->nr_pending); |
3163 | /* and we write to 'i' (if not in_sync) */ | 3149 | /* and we write to 'i' (if not in_sync) */ |
@@ -3181,7 +3167,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3181 | bio->bi_private = r10_bio; | 3167 | bio->bi_private = r10_bio; |
3182 | bio->bi_end_io = end_sync_write; | 3168 | bio->bi_end_io = end_sync_write; |
3183 | bio->bi_rw = WRITE; | 3169 | bio->bi_rw = WRITE; |
3184 | bio->bi_sector = to_addr | 3170 | bio->bi_iter.bi_sector = to_addr |
3185 | + rdev->data_offset; | 3171 | + rdev->data_offset; |
3186 | bio->bi_bdev = rdev->bdev; | 3172 | bio->bi_bdev = rdev->bdev; |
3187 | atomic_inc(&r10_bio->remaining); | 3173 | atomic_inc(&r10_bio->remaining); |
@@ -3210,7 +3196,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3210 | bio->bi_private = r10_bio; | 3196 | bio->bi_private = r10_bio; |
3211 | bio->bi_end_io = end_sync_write; | 3197 | bio->bi_end_io = end_sync_write; |
3212 | bio->bi_rw = WRITE; | 3198 | bio->bi_rw = WRITE; |
3213 | bio->bi_sector = to_addr + rdev->data_offset; | 3199 | bio->bi_iter.bi_sector = to_addr + |
3200 | rdev->data_offset; | ||
3214 | bio->bi_bdev = rdev->bdev; | 3201 | bio->bi_bdev = rdev->bdev; |
3215 | atomic_inc(&r10_bio->remaining); | 3202 | atomic_inc(&r10_bio->remaining); |
3216 | break; | 3203 | break; |
@@ -3218,10 +3205,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3218 | if (j == conf->copies) { | 3205 | if (j == conf->copies) { |
3219 | /* Cannot recover, so abort the recovery or | 3206 | /* Cannot recover, so abort the recovery or |
3220 | * record a bad block */ | 3207 | * record a bad block */ |
3221 | put_buf(r10_bio); | ||
3222 | if (rb2) | ||
3223 | atomic_dec(&rb2->remaining); | ||
3224 | r10_bio = rb2; | ||
3225 | if (any_working) { | 3208 | if (any_working) { |
3226 | /* problem is that there are bad blocks | 3209 | /* problem is that there are bad blocks |
3227 | * on other device(s) | 3210 | * on other device(s) |
@@ -3253,6 +3236,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3253 | mirror->recovery_disabled | 3236 | mirror->recovery_disabled |
3254 | = mddev->recovery_disabled; | 3237 | = mddev->recovery_disabled; |
3255 | } | 3238 | } |
3239 | put_buf(r10_bio); | ||
3240 | if (rb2) | ||
3241 | atomic_dec(&rb2->remaining); | ||
3242 | r10_bio = rb2; | ||
3256 | break; | 3243 | break; |
3257 | } | 3244 | } |
3258 | } | 3245 | } |
@@ -3328,7 +3315,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3328 | bio->bi_private = r10_bio; | 3315 | bio->bi_private = r10_bio; |
3329 | bio->bi_end_io = end_sync_read; | 3316 | bio->bi_end_io = end_sync_read; |
3330 | bio->bi_rw = READ; | 3317 | bio->bi_rw = READ; |
3331 | bio->bi_sector = sector + | 3318 | bio->bi_iter.bi_sector = sector + |
3332 | conf->mirrors[d].rdev->data_offset; | 3319 | conf->mirrors[d].rdev->data_offset; |
3333 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 3320 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
3334 | count++; | 3321 | count++; |
@@ -3350,7 +3337,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3350 | bio->bi_private = r10_bio; | 3337 | bio->bi_private = r10_bio; |
3351 | bio->bi_end_io = end_sync_write; | 3338 | bio->bi_end_io = end_sync_write; |
3352 | bio->bi_rw = WRITE; | 3339 | bio->bi_rw = WRITE; |
3353 | bio->bi_sector = sector + | 3340 | bio->bi_iter.bi_sector = sector + |
3354 | conf->mirrors[d].replacement->data_offset; | 3341 | conf->mirrors[d].replacement->data_offset; |
3355 | bio->bi_bdev = conf->mirrors[d].replacement->bdev; | 3342 | bio->bi_bdev = conf->mirrors[d].replacement->bdev; |
3356 | count++; | 3343 | count++; |
@@ -3397,7 +3384,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3397 | bio2 = bio2->bi_next) { | 3384 | bio2 = bio2->bi_next) { |
3398 | /* remove last page from this bio */ | 3385 | /* remove last page from this bio */ |
3399 | bio2->bi_vcnt--; | 3386 | bio2->bi_vcnt--; |
3400 | bio2->bi_size -= len; | 3387 | bio2->bi_iter.bi_size -= len; |
3401 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | 3388 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); |
3402 | } | 3389 | } |
3403 | goto bio_full; | 3390 | goto bio_full; |
@@ -3747,7 +3734,8 @@ static int run(struct mddev *mddev) | |||
3747 | !test_bit(In_sync, &disk->rdev->flags)) { | 3734 | !test_bit(In_sync, &disk->rdev->flags)) { |
3748 | disk->head_position = 0; | 3735 | disk->head_position = 0; |
3749 | mddev->degraded++; | 3736 | mddev->degraded++; |
3750 | if (disk->rdev) | 3737 | if (disk->rdev && |
3738 | disk->rdev->saved_raid_disk < 0) | ||
3751 | conf->fullsync = 1; | 3739 | conf->fullsync = 1; |
3752 | } | 3740 | } |
3753 | disk->recovery_disabled = mddev->recovery_disabled - 1; | 3741 | disk->recovery_disabled = mddev->recovery_disabled - 1; |
@@ -4417,7 +4405,7 @@ read_more: | |||
4417 | read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); | 4405 | read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); |
4418 | 4406 | ||
4419 | read_bio->bi_bdev = rdev->bdev; | 4407 | read_bio->bi_bdev = rdev->bdev; |
4420 | read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr | 4408 | read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr |
4421 | + rdev->data_offset); | 4409 | + rdev->data_offset); |
4422 | read_bio->bi_private = r10_bio; | 4410 | read_bio->bi_private = r10_bio; |
4423 | read_bio->bi_end_io = end_sync_read; | 4411 | read_bio->bi_end_io = end_sync_read; |
@@ -4425,7 +4413,7 @@ read_more: | |||
4425 | read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); | 4413 | read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); |
4426 | read_bio->bi_flags |= 1 << BIO_UPTODATE; | 4414 | read_bio->bi_flags |= 1 << BIO_UPTODATE; |
4427 | read_bio->bi_vcnt = 0; | 4415 | read_bio->bi_vcnt = 0; |
4428 | read_bio->bi_size = 0; | 4416 | read_bio->bi_iter.bi_size = 0; |
4429 | r10_bio->master_bio = read_bio; | 4417 | r10_bio->master_bio = read_bio; |
4430 | r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; | 4418 | r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; |
4431 | 4419 | ||
@@ -4451,7 +4439,8 @@ read_more: | |||
4451 | 4439 | ||
4452 | bio_reset(b); | 4440 | bio_reset(b); |
4453 | b->bi_bdev = rdev2->bdev; | 4441 | b->bi_bdev = rdev2->bdev; |
4454 | b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; | 4442 | b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + |
4443 | rdev2->new_data_offset; | ||
4455 | b->bi_private = r10_bio; | 4444 | b->bi_private = r10_bio; |
4456 | b->bi_end_io = end_reshape_write; | 4445 | b->bi_end_io = end_reshape_write; |
4457 | b->bi_rw = WRITE; | 4446 | b->bi_rw = WRITE; |
@@ -4478,7 +4467,7 @@ read_more: | |||
4478 | bio2 = bio2->bi_next) { | 4467 | bio2 = bio2->bi_next) { |
4479 | /* Remove last page from this bio */ | 4468 | /* Remove last page from this bio */ |
4480 | bio2->bi_vcnt--; | 4469 | bio2->bi_vcnt--; |
4481 | bio2->bi_size -= len; | 4470 | bio2->bi_iter.bi_size -= len; |
4482 | bio2->bi_flags &= ~(1<<BIO_SEG_VALID); | 4471 | bio2->bi_flags &= ~(1<<BIO_SEG_VALID); |
4483 | } | 4472 | } |
4484 | goto bio_full; | 4473 | goto bio_full; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cc055da02e2a..16f5c21963db 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -133,7 +133,7 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | |||
133 | static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | 133 | static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) |
134 | { | 134 | { |
135 | int sectors = bio_sectors(bio); | 135 | int sectors = bio_sectors(bio); |
136 | if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) | 136 | if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) |
137 | return bio->bi_next; | 137 | return bio->bi_next; |
138 | else | 138 | else |
139 | return NULL; | 139 | return NULL; |
@@ -225,7 +225,7 @@ static void return_io(struct bio *return_bi) | |||
225 | 225 | ||
226 | return_bi = bi->bi_next; | 226 | return_bi = bi->bi_next; |
227 | bi->bi_next = NULL; | 227 | bi->bi_next = NULL; |
228 | bi->bi_size = 0; | 228 | bi->bi_iter.bi_size = 0; |
229 | trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), | 229 | trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), |
230 | bi, 0); | 230 | bi, 0); |
231 | bio_endio(bi, 0); | 231 | bio_endio(bi, 0); |
@@ -675,8 +675,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
675 | || !conf->inactive_blocked), | 675 | || !conf->inactive_blocked), |
676 | *(conf->hash_locks + hash)); | 676 | *(conf->hash_locks + hash)); |
677 | conf->inactive_blocked = 0; | 677 | conf->inactive_blocked = 0; |
678 | } else | 678 | } else { |
679 | init_stripe(sh, sector, previous); | 679 | init_stripe(sh, sector, previous); |
680 | atomic_inc(&sh->count); | ||
681 | } | ||
680 | } else { | 682 | } else { |
681 | spin_lock(&conf->device_lock); | 683 | spin_lock(&conf->device_lock); |
682 | if (atomic_read(&sh->count)) { | 684 | if (atomic_read(&sh->count)) { |
@@ -687,20 +689,19 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
687 | } else { | 689 | } else { |
688 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 690 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
689 | atomic_inc(&conf->active_stripes); | 691 | atomic_inc(&conf->active_stripes); |
690 | BUG_ON(list_empty(&sh->lru)); | 692 | BUG_ON(list_empty(&sh->lru) && |
693 | !test_bit(STRIPE_EXPANDING, &sh->state)); | ||
691 | list_del_init(&sh->lru); | 694 | list_del_init(&sh->lru); |
692 | if (sh->group) { | 695 | if (sh->group) { |
693 | sh->group->stripes_cnt--; | 696 | sh->group->stripes_cnt--; |
694 | sh->group = NULL; | 697 | sh->group = NULL; |
695 | } | 698 | } |
696 | } | 699 | } |
700 | atomic_inc(&sh->count); | ||
697 | spin_unlock(&conf->device_lock); | 701 | spin_unlock(&conf->device_lock); |
698 | } | 702 | } |
699 | } while (sh == NULL); | 703 | } while (sh == NULL); |
700 | 704 | ||
701 | if (sh) | ||
702 | atomic_inc(&sh->count); | ||
703 | |||
704 | spin_unlock_irq(conf->hash_locks + hash); | 705 | spin_unlock_irq(conf->hash_locks + hash); |
705 | return sh; | 706 | return sh; |
706 | } | 707 | } |
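Beyond the field renames, the get_active_stripe() hunks above move the atomic_inc(&sh->count) inside the locked sections (and relax the lru BUG_ON for stripes that are mid-expansion), so the reference is taken before the lock that protects the lookup is dropped. A generic sketch of that take-the-reference-before-unlocking pattern; the structure and function names are illustrative, not raid5's.

    #include <linux/atomic.h>
    #include <linux/spinlock.h>

    struct cached_item {
            atomic_t count;
            /* ... payload ... */
    };

    /* Bump the refcount while the lock guarding the lookup is still held,
     * so the item cannot be freed or recycled between finding and using it. */
    static struct cached_item *get_item(spinlock_t *lock,
                                        struct cached_item *(*lookup)(void *key),
                                        void *key)
    {
            struct cached_item *item;

            spin_lock(lock);
            item = lookup(key);
            if (item)
                    atomic_inc(&item->count);
            spin_unlock(lock);
            return item;
    }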
@@ -851,10 +852,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
851 | bi->bi_rw, i); | 852 | bi->bi_rw, i); |
852 | atomic_inc(&sh->count); | 853 | atomic_inc(&sh->count); |
853 | if (use_new_offset(conf, sh)) | 854 | if (use_new_offset(conf, sh)) |
854 | bi->bi_sector = (sh->sector | 855 | bi->bi_iter.bi_sector = (sh->sector |
855 | + rdev->new_data_offset); | 856 | + rdev->new_data_offset); |
856 | else | 857 | else |
857 | bi->bi_sector = (sh->sector | 858 | bi->bi_iter.bi_sector = (sh->sector |
858 | + rdev->data_offset); | 859 | + rdev->data_offset); |
859 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 860 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
860 | bi->bi_rw |= REQ_NOMERGE; | 861 | bi->bi_rw |= REQ_NOMERGE; |
@@ -862,7 +863,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
862 | bi->bi_vcnt = 1; | 863 | bi->bi_vcnt = 1; |
863 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 864 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
864 | bi->bi_io_vec[0].bv_offset = 0; | 865 | bi->bi_io_vec[0].bv_offset = 0; |
865 | bi->bi_size = STRIPE_SIZE; | 866 | bi->bi_iter.bi_size = STRIPE_SIZE; |
866 | /* | 867 | /* |
867 | * If this is discard request, set bi_vcnt 0. We don't | 868 | * If this is discard request, set bi_vcnt 0. We don't |
868 | * want to confuse SCSI because SCSI will replace payload | 869 | * want to confuse SCSI because SCSI will replace payload |
@@ -898,15 +899,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
898 | rbi->bi_rw, i); | 899 | rbi->bi_rw, i); |
899 | atomic_inc(&sh->count); | 900 | atomic_inc(&sh->count); |
900 | if (use_new_offset(conf, sh)) | 901 | if (use_new_offset(conf, sh)) |
901 | rbi->bi_sector = (sh->sector | 902 | rbi->bi_iter.bi_sector = (sh->sector |
902 | + rrdev->new_data_offset); | 903 | + rrdev->new_data_offset); |
903 | else | 904 | else |
904 | rbi->bi_sector = (sh->sector | 905 | rbi->bi_iter.bi_sector = (sh->sector |
905 | + rrdev->data_offset); | 906 | + rrdev->data_offset); |
906 | rbi->bi_vcnt = 1; | 907 | rbi->bi_vcnt = 1; |
907 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 908 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
908 | rbi->bi_io_vec[0].bv_offset = 0; | 909 | rbi->bi_io_vec[0].bv_offset = 0; |
909 | rbi->bi_size = STRIPE_SIZE; | 910 | rbi->bi_iter.bi_size = STRIPE_SIZE; |
910 | /* | 911 | /* |
911 | * If this is discard request, set bi_vcnt 0. We don't | 912 | * If this is discard request, set bi_vcnt 0. We don't |
912 | * want to confuse SCSI because SCSI will replace payload | 913 | * want to confuse SCSI because SCSI will replace payload |
@@ -934,24 +935,24 @@ static struct dma_async_tx_descriptor * | |||
934 | async_copy_data(int frombio, struct bio *bio, struct page *page, | 935 | async_copy_data(int frombio, struct bio *bio, struct page *page, |
935 | sector_t sector, struct dma_async_tx_descriptor *tx) | 936 | sector_t sector, struct dma_async_tx_descriptor *tx) |
936 | { | 937 | { |
937 | struct bio_vec *bvl; | 938 | struct bio_vec bvl; |
939 | struct bvec_iter iter; | ||
938 | struct page *bio_page; | 940 | struct page *bio_page; |
939 | int i; | ||
940 | int page_offset; | 941 | int page_offset; |
941 | struct async_submit_ctl submit; | 942 | struct async_submit_ctl submit; |
942 | enum async_tx_flags flags = 0; | 943 | enum async_tx_flags flags = 0; |
943 | 944 | ||
944 | if (bio->bi_sector >= sector) | 945 | if (bio->bi_iter.bi_sector >= sector) |
945 | page_offset = (signed)(bio->bi_sector - sector) * 512; | 946 | page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; |
946 | else | 947 | else |
947 | page_offset = (signed)(sector - bio->bi_sector) * -512; | 948 | page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; |
948 | 949 | ||
949 | if (frombio) | 950 | if (frombio) |
950 | flags |= ASYNC_TX_FENCE; | 951 | flags |= ASYNC_TX_FENCE; |
951 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); | 952 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); |
952 | 953 | ||
953 | bio_for_each_segment(bvl, bio, i) { | 954 | bio_for_each_segment(bvl, bio, iter) { |
954 | int len = bvl->bv_len; | 955 | int len = bvl.bv_len; |
955 | int clen; | 956 | int clen; |
956 | int b_offset = 0; | 957 | int b_offset = 0; |
957 | 958 | ||
@@ -967,8 +968,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
967 | clen = len; | 968 | clen = len; |
968 | 969 | ||
969 | if (clen > 0) { | 970 | if (clen > 0) { |
970 | b_offset += bvl->bv_offset; | 971 | b_offset += bvl.bv_offset; |
971 | bio_page = bvl->bv_page; | 972 | bio_page = bvl.bv_page; |
972 | if (frombio) | 973 | if (frombio) |
973 | tx = async_memcpy(page, bio_page, page_offset, | 974 | tx = async_memcpy(page, bio_page, page_offset, |
974 | b_offset, clen, &submit); | 975 | b_offset, clen, &submit); |
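The async_copy_data() hunk above shows the other half of the immutable-biovec work: bio_for_each_segment() is now driven by a struct bvec_iter and hands each struct bio_vec back by value, so segment fields are read with '.' rather than '->' and the walk starts from however much of the bio has already been consumed. A small stand-alone example of the new iteration style; the helper name is made up.

    #include <linux/bio.h>

    /* Sum the bytes of the segments still covered by this bio. */
    static unsigned int bio_payload_bytes(struct bio *bio)
    {
            struct bio_vec bvec;    /* filled in by value on each iteration */
            struct bvec_iter iter;
            unsigned int bytes = 0;

            bio_for_each_segment(bvec, bio, iter)
                    bytes += bvec.bv_len;

            return bytes;
    }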
@@ -1011,7 +1012,7 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
1011 | BUG_ON(!dev->read); | 1012 | BUG_ON(!dev->read); |
1012 | rbi = dev->read; | 1013 | rbi = dev->read; |
1013 | dev->read = NULL; | 1014 | dev->read = NULL; |
1014 | while (rbi && rbi->bi_sector < | 1015 | while (rbi && rbi->bi_iter.bi_sector < |
1015 | dev->sector + STRIPE_SECTORS) { | 1016 | dev->sector + STRIPE_SECTORS) { |
1016 | rbi2 = r5_next_bio(rbi, dev->sector); | 1017 | rbi2 = r5_next_bio(rbi, dev->sector); |
1017 | if (!raid5_dec_bi_active_stripes(rbi)) { | 1018 | if (!raid5_dec_bi_active_stripes(rbi)) { |
@@ -1047,7 +1048,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
1047 | dev->read = rbi = dev->toread; | 1048 | dev->read = rbi = dev->toread; |
1048 | dev->toread = NULL; | 1049 | dev->toread = NULL; |
1049 | spin_unlock_irq(&sh->stripe_lock); | 1050 | spin_unlock_irq(&sh->stripe_lock); |
1050 | while (rbi && rbi->bi_sector < | 1051 | while (rbi && rbi->bi_iter.bi_sector < |
1051 | dev->sector + STRIPE_SECTORS) { | 1052 | dev->sector + STRIPE_SECTORS) { |
1052 | tx = async_copy_data(0, rbi, dev->page, | 1053 | tx = async_copy_data(0, rbi, dev->page, |
1053 | dev->sector, tx); | 1054 | dev->sector, tx); |
@@ -1389,7 +1390,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1389 | wbi = dev->written = chosen; | 1390 | wbi = dev->written = chosen; |
1390 | spin_unlock_irq(&sh->stripe_lock); | 1391 | spin_unlock_irq(&sh->stripe_lock); |
1391 | 1392 | ||
1392 | while (wbi && wbi->bi_sector < | 1393 | while (wbi && wbi->bi_iter.bi_sector < |
1393 | dev->sector + STRIPE_SECTORS) { | 1394 | dev->sector + STRIPE_SECTORS) { |
1394 | if (wbi->bi_rw & REQ_FUA) | 1395 | if (wbi->bi_rw & REQ_FUA) |
1395 | set_bit(R5_WantFUA, &dev->flags); | 1396 | set_bit(R5_WantFUA, &dev->flags); |
@@ -2110,6 +2111,7 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
2110 | set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); | 2111 | set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); |
2111 | } else { | 2112 | } else { |
2112 | if (!uptodate) { | 2113 | if (!uptodate) { |
2114 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
2113 | set_bit(WriteErrorSeen, &rdev->flags); | 2115 | set_bit(WriteErrorSeen, &rdev->flags); |
2114 | set_bit(R5_WriteError, &sh->dev[i].flags); | 2116 | set_bit(R5_WriteError, &sh->dev[i].flags); |
2115 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | 2117 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) |
@@ -2613,7 +2615,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2613 | int firstwrite=0; | 2615 | int firstwrite=0; |
2614 | 2616 | ||
2615 | pr_debug("adding bi b#%llu to stripe s#%llu\n", | 2617 | pr_debug("adding bi b#%llu to stripe s#%llu\n", |
2616 | (unsigned long long)bi->bi_sector, | 2618 | (unsigned long long)bi->bi_iter.bi_sector, |
2617 | (unsigned long long)sh->sector); | 2619 | (unsigned long long)sh->sector); |
2618 | 2620 | ||
2619 | /* | 2621 | /* |
@@ -2631,12 +2633,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2631 | firstwrite = 1; | 2633 | firstwrite = 1; |
2632 | } else | 2634 | } else |
2633 | bip = &sh->dev[dd_idx].toread; | 2635 | bip = &sh->dev[dd_idx].toread; |
2634 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | 2636 | while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { |
2635 | if (bio_end_sector(*bip) > bi->bi_sector) | 2637 | if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) |
2636 | goto overlap; | 2638 | goto overlap; |
2637 | bip = & (*bip)->bi_next; | 2639 | bip = & (*bip)->bi_next; |
2638 | } | 2640 | } |
2639 | if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) | 2641 | if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) |
2640 | goto overlap; | 2642 | goto overlap; |
2641 | 2643 | ||
2642 | BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); | 2644 | BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); |
@@ -2650,7 +2652,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2650 | sector_t sector = sh->dev[dd_idx].sector; | 2652 | sector_t sector = sh->dev[dd_idx].sector; |
2651 | for (bi=sh->dev[dd_idx].towrite; | 2653 | for (bi=sh->dev[dd_idx].towrite; |
2652 | sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && | 2654 | sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && |
2653 | bi && bi->bi_sector <= sector; | 2655 | bi && bi->bi_iter.bi_sector <= sector; |
2654 | bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { | 2656 | bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { |
2655 | if (bio_end_sector(bi) >= sector) | 2657 | if (bio_end_sector(bi) >= sector) |
2656 | sector = bio_end_sector(bi); | 2658 | sector = bio_end_sector(bi); |
@@ -2660,7 +2662,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2660 | } | 2662 | } |
2661 | 2663 | ||
2662 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2664 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
2663 | (unsigned long long)(*bip)->bi_sector, | 2665 | (unsigned long long)(*bip)->bi_iter.bi_sector, |
2664 | (unsigned long long)sh->sector, dd_idx); | 2666 | (unsigned long long)sh->sector, dd_idx); |
2665 | spin_unlock_irq(&sh->stripe_lock); | 2667 | spin_unlock_irq(&sh->stripe_lock); |
2666 | 2668 | ||
@@ -2735,7 +2737,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2735 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 2737 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
2736 | wake_up(&conf->wait_for_overlap); | 2738 | wake_up(&conf->wait_for_overlap); |
2737 | 2739 | ||
2738 | while (bi && bi->bi_sector < | 2740 | while (bi && bi->bi_iter.bi_sector < |
2739 | sh->dev[i].sector + STRIPE_SECTORS) { | 2741 | sh->dev[i].sector + STRIPE_SECTORS) { |
2740 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | 2742 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); |
2741 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2743 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
@@ -2754,7 +2756,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2754 | bi = sh->dev[i].written; | 2756 | bi = sh->dev[i].written; |
2755 | sh->dev[i].written = NULL; | 2757 | sh->dev[i].written = NULL; |
2756 | if (bi) bitmap_end = 1; | 2758 | if (bi) bitmap_end = 1; |
2757 | while (bi && bi->bi_sector < | 2759 | while (bi && bi->bi_iter.bi_sector < |
2758 | sh->dev[i].sector + STRIPE_SECTORS) { | 2760 | sh->dev[i].sector + STRIPE_SECTORS) { |
2759 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 2761 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
2760 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2762 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
@@ -2778,7 +2780,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2778 | spin_unlock_irq(&sh->stripe_lock); | 2780 | spin_unlock_irq(&sh->stripe_lock); |
2779 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 2781 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
2780 | wake_up(&conf->wait_for_overlap); | 2782 | wake_up(&conf->wait_for_overlap); |
2781 | while (bi && bi->bi_sector < | 2783 | while (bi && bi->bi_iter.bi_sector < |
2782 | sh->dev[i].sector + STRIPE_SECTORS) { | 2784 | sh->dev[i].sector + STRIPE_SECTORS) { |
2783 | struct bio *nextbi = | 2785 | struct bio *nextbi = |
2784 | r5_next_bio(bi, sh->dev[i].sector); | 2786 | r5_next_bio(bi, sh->dev[i].sector); |
@@ -3002,7 +3004,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3002 | clear_bit(R5_UPTODATE, &dev->flags); | 3004 | clear_bit(R5_UPTODATE, &dev->flags); |
3003 | wbi = dev->written; | 3005 | wbi = dev->written; |
3004 | dev->written = NULL; | 3006 | dev->written = NULL; |
3005 | while (wbi && wbi->bi_sector < | 3007 | while (wbi && wbi->bi_iter.bi_sector < |
3006 | dev->sector + STRIPE_SECTORS) { | 3008 | dev->sector + STRIPE_SECTORS) { |
3007 | wbi2 = r5_next_bio(wbi, dev->sector); | 3009 | wbi2 = r5_next_bio(wbi, dev->sector); |
3008 | if (!raid5_dec_bi_active_stripes(wbi)) { | 3010 | if (!raid5_dec_bi_active_stripes(wbi)) { |
@@ -3608,7 +3610,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3608 | */ | 3610 | */ |
3609 | set_bit(R5_Insync, &dev->flags); | 3611 | set_bit(R5_Insync, &dev->flags); |
3610 | 3612 | ||
3611 | if (rdev && test_bit(R5_WriteError, &dev->flags)) { | 3613 | if (test_bit(R5_WriteError, &dev->flags)) { |
3612 | /* This flag does not apply to '.replacement' | 3614 | /* This flag does not apply to '.replacement' |
3613 | * only to .rdev, so make sure to check that*/ | 3615 | * only to .rdev, so make sure to check that*/ |
3614 | struct md_rdev *rdev2 = rcu_dereference( | 3616 | struct md_rdev *rdev2 = rcu_dereference( |
@@ -3621,7 +3623,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3621 | } else | 3623 | } else |
3622 | clear_bit(R5_WriteError, &dev->flags); | 3624 | clear_bit(R5_WriteError, &dev->flags); |
3623 | } | 3625 | } |
3624 | if (rdev && test_bit(R5_MadeGood, &dev->flags)) { | 3626 | if (test_bit(R5_MadeGood, &dev->flags)) { |
3625 | /* This flag does not apply to '.replacement' | 3627 | /* This flag does not apply to '.replacement' |
3626 | * only to .rdev, so make sure to check that*/ | 3628 | * only to .rdev, so make sure to check that*/ |
3627 | struct md_rdev *rdev2 = rcu_dereference( | 3629 | struct md_rdev *rdev2 = rcu_dereference( |
@@ -4094,7 +4096,7 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
4094 | 4096 | ||
4095 | static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) | 4097 | static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) |
4096 | { | 4098 | { |
4097 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | 4099 | sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); |
4098 | unsigned int chunk_sectors = mddev->chunk_sectors; | 4100 | unsigned int chunk_sectors = mddev->chunk_sectors; |
4099 | unsigned int bio_sectors = bio_sectors(bio); | 4101 | unsigned int bio_sectors = bio_sectors(bio); |
4100 | 4102 | ||
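Only the first statement of in_chunk_boundary() is visible in this hunk: it folds the partition start from get_start_sect() into the bio's sector before the boundary test. The rest of the function is not shown here, so the standalone model below is an assumption about the kind of power-of-two check such a helper performs, not a copy of the kernel body:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Model: with a power-of-two chunk size, a bio stays inside one chunk
 * iff its offset within the chunk plus its length still fits. */
static bool fits_in_one_chunk(sector_t sector, unsigned int bio_sectors,
			      unsigned int chunk_sectors)
{
	return ((sector & (chunk_sectors - 1)) + bio_sectors) <= chunk_sectors;
}

int main(void)
{
	/* 128-sector (64 KiB) chunks: an 8-sector bio at sector 120 fits,
	 * the same bio starting at sector 124 would straddle two chunks. */
	printf("%d %d\n", fits_in_one_chunk(120, 8, 128),
			  fits_in_one_chunk(124, 8, 128));
	return 0;
}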
@@ -4231,9 +4233,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
4231 | /* | 4233 | /* |
4232 | * compute position | 4234 | * compute position |
4233 | */ | 4235 | */ |
4234 | align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, | 4236 | align_bi->bi_iter.bi_sector = |
4235 | 0, | 4237 | raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, |
4236 | &dd_idx, NULL); | 4238 | 0, &dd_idx, NULL); |
4237 | 4239 | ||
4238 | end_sector = bio_end_sector(align_bi); | 4240 | end_sector = bio_end_sector(align_bi); |
4239 | rcu_read_lock(); | 4241 | rcu_read_lock(); |
@@ -4258,7 +4260,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
4258 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 4260 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
4259 | 4261 | ||
4260 | if (!bio_fits_rdev(align_bi) || | 4262 | if (!bio_fits_rdev(align_bi) || |
4261 | is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), | 4263 | is_badblock(rdev, align_bi->bi_iter.bi_sector, |
4264 | bio_sectors(align_bi), | ||
4262 | &first_bad, &bad_sectors)) { | 4265 | &first_bad, &bad_sectors)) { |
4263 | /* too big in some way, or has a known bad block */ | 4266 | /* too big in some way, or has a known bad block */ |
4264 | bio_put(align_bi); | 4267 | bio_put(align_bi); |
@@ -4267,7 +4270,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
4267 | } | 4270 | } |
4268 | 4271 | ||
4269 | /* No reshape active, so we can trust rdev->data_offset */ | 4272 | /* No reshape active, so we can trust rdev->data_offset */ |
4270 | align_bi->bi_sector += rdev->data_offset; | 4273 | align_bi->bi_iter.bi_sector += rdev->data_offset; |
4271 | 4274 | ||
4272 | spin_lock_irq(&conf->device_lock); | 4275 | spin_lock_irq(&conf->device_lock); |
4273 | wait_event_lock_irq(conf->wait_for_stripe, | 4276 | wait_event_lock_irq(conf->wait_for_stripe, |
@@ -4279,7 +4282,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
4279 | if (mddev->gendisk) | 4282 | if (mddev->gendisk) |
4280 | trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), | 4283 | trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), |
4281 | align_bi, disk_devt(mddev->gendisk), | 4284 | align_bi, disk_devt(mddev->gendisk), |
4282 | raid_bio->bi_sector); | 4285 | raid_bio->bi_iter.bi_sector); |
4283 | generic_make_request(align_bi); | 4286 | generic_make_request(align_bi); |
4284 | return 1; | 4287 | return 1; |
4285 | } else { | 4288 | } else { |
@@ -4462,8 +4465,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) | |||
4462 | /* Skip discard while reshape is happening */ | 4465 | /* Skip discard while reshape is happening */ |
4463 | return; | 4466 | return; |
4464 | 4467 | ||
4465 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 4468 | logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); |
4466 | last_sector = bi->bi_sector + (bi->bi_size>>9); | 4469 | last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); |
4467 | 4470 | ||
4468 | bi->bi_next = NULL; | 4471 | bi->bi_next = NULL; |
4469 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | 4472 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ |
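make_discard_request() rounds the discard range to stripe units: the start sector is masked down to a STRIPE_SECTORS boundary and the end is the start plus the byte count shifted into sectors. A worked example, assuming 4 KiB pages so that STRIPE_SECTORS is 8 (that value is an assumption; it is not visible in this hunk):

#include <stdio.h>

typedef unsigned long long sector_t;

#define STRIPE_SECTORS	8ULL	/* assumed: PAGE_SIZE >> 9 with 4 KiB pages */

int main(void)
{
	sector_t bi_sector = 1234;		/* bio start, in 512-byte sectors */
	unsigned int bi_size = 1 << 20;		/* bio length in bytes (1 MiB) */

	/* Same arithmetic as the two rewritten lines above. */
	sector_t logical_sector = bi_sector & ~((sector_t)STRIPE_SECTORS - 1);
	sector_t last_sector = bi_sector + (bi_size >> 9);

	/* 1234 & ~7 = 1232, and 1234 + (1 MiB >> 9) = 1234 + 2048 = 3282 */
	printf("logical_sector=%llu last_sector=%llu\n",
	       logical_sector, last_sector);
	return 0;
}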
@@ -4567,7 +4570,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4567 | return; | 4570 | return; |
4568 | } | 4571 | } |
4569 | 4572 | ||
4570 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 4573 | logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); |
4571 | last_sector = bio_end_sector(bi); | 4574 | last_sector = bio_end_sector(bi); |
4572 | bi->bi_next = NULL; | 4575 | bi->bi_next = NULL; |
4573 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | 4576 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ |
@@ -5051,7 +5054,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
5051 | int remaining; | 5054 | int remaining; |
5052 | int handled = 0; | 5055 | int handled = 0; |
5053 | 5056 | ||
5054 | logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 5057 | logical_sector = raid_bio->bi_iter.bi_sector & |
5058 | ~((sector_t)STRIPE_SECTORS-1); | ||
5055 | sector = raid5_compute_sector(conf, logical_sector, | 5059 | sector = raid5_compute_sector(conf, logical_sector, |
5056 | 0, &dd_idx, NULL); | 5060 | 0, &dd_idx, NULL); |
5057 | last_sector = bio_end_sector(raid_bio); | 5061 | last_sector = bio_end_sector(raid_bio); |
@@ -5510,23 +5514,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
5510 | return sectors * (raid_disks - conf->max_degraded); | 5514 | return sectors * (raid_disks - conf->max_degraded); |
5511 | } | 5515 | } |
5512 | 5516 | ||
5517 | static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) | ||
5518 | { | ||
5519 | safe_put_page(percpu->spare_page); | ||
5520 | kfree(percpu->scribble); | ||
5521 | percpu->spare_page = NULL; | ||
5522 | percpu->scribble = NULL; | ||
5523 | } | ||
5524 | |||
5525 | static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) | ||
5526 | { | ||
5527 | if (conf->level == 6 && !percpu->spare_page) | ||
5528 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
5529 | if (!percpu->scribble) | ||
5530 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
5531 | |||
5532 | if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { | ||
5533 | free_scratch_buffer(conf, percpu); | ||
5534 | return -ENOMEM; | ||
5535 | } | ||
5536 | |||
5537 | return 0; | ||
5538 | } | ||
5539 | |||
5513 | static void raid5_free_percpu(struct r5conf *conf) | 5540 | static void raid5_free_percpu(struct r5conf *conf) |
5514 | { | 5541 | { |
5515 | struct raid5_percpu *percpu; | ||
5516 | unsigned long cpu; | 5542 | unsigned long cpu; |
5517 | 5543 | ||
5518 | if (!conf->percpu) | 5544 | if (!conf->percpu) |
5519 | return; | 5545 | return; |
5520 | 5546 | ||
5521 | get_online_cpus(); | ||
5522 | for_each_possible_cpu(cpu) { | ||
5523 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
5524 | safe_put_page(percpu->spare_page); | ||
5525 | kfree(percpu->scribble); | ||
5526 | } | ||
5527 | #ifdef CONFIG_HOTPLUG_CPU | 5547 | #ifdef CONFIG_HOTPLUG_CPU |
5528 | unregister_cpu_notifier(&conf->cpu_notify); | 5548 | unregister_cpu_notifier(&conf->cpu_notify); |
5529 | #endif | 5549 | #endif |
5550 | |||
5551 | get_online_cpus(); | ||
5552 | for_each_possible_cpu(cpu) | ||
5553 | free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); | ||
5530 | put_online_cpus(); | 5554 | put_online_cpus(); |
5531 | 5555 | ||
5532 | free_percpu(conf->percpu); | 5556 | free_percpu(conf->percpu); |
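The new free_scratch_buffer()/alloc_scratch_buffer() pair centralizes the all-or-nothing policy for the per-CPU spare page and scribble buffer that was previously open-coded in raid5_free_percpu(), raid456_cpu_notify() and raid5_alloc_percpu(): either both allocations succeed or the state is torn back down to NULL pointers. A simplified userspace model of that policy, with plain malloc()/free() standing in for alloc_page()/kmalloc() and invented names:

#include <stdlib.h>

/* Simplified model of the per-CPU scratch state; not the kernel structs. */
struct scratch {
	void *spare_page;	/* only needed for RAID6 (level 6) */
	void *scribble;
};

static void free_scratch(struct scratch *s)
{
	free(s->spare_page);
	free(s->scribble);
	s->spare_page = NULL;
	s->scribble = NULL;
}

/* Allocate both buffers or leave the struct empty, so callers only check
 * the return value and never care which half failed. */
static int alloc_scratch(struct scratch *s, int level, size_t scribble_len)
{
	if (level == 6 && !s->spare_page)
		s->spare_page = malloc(4096);
	if (!s->scribble)
		s->scribble = malloc(scribble_len);

	if (!s->scribble || (level == 6 && !s->spare_page)) {
		free_scratch(s);
		return -1;	/* -ENOMEM in the kernel version */
	}
	return 0;
}

With the helpers in place, the CPU-notifier and setup/teardown paths in the following hunks shrink to single calls, which is where most of the removed lines in this part of the diff go.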
@@ -5553,15 +5577,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
5553 | switch (action) { | 5577 | switch (action) { |
5554 | case CPU_UP_PREPARE: | 5578 | case CPU_UP_PREPARE: |
5555 | case CPU_UP_PREPARE_FROZEN: | 5579 | case CPU_UP_PREPARE_FROZEN: |
5556 | if (conf->level == 6 && !percpu->spare_page) | 5580 | if (alloc_scratch_buffer(conf, percpu)) { |
5557 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
5558 | if (!percpu->scribble) | ||
5559 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
5560 | |||
5561 | if (!percpu->scribble || | ||
5562 | (conf->level == 6 && !percpu->spare_page)) { | ||
5563 | safe_put_page(percpu->spare_page); | ||
5564 | kfree(percpu->scribble); | ||
5565 | pr_err("%s: failed memory allocation for cpu%ld\n", | 5581 | pr_err("%s: failed memory allocation for cpu%ld\n", |
5566 | __func__, cpu); | 5582 | __func__, cpu); |
5567 | return notifier_from_errno(-ENOMEM); | 5583 | return notifier_from_errno(-ENOMEM); |
@@ -5569,10 +5585,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
5569 | break; | 5585 | break; |
5570 | case CPU_DEAD: | 5586 | case CPU_DEAD: |
5571 | case CPU_DEAD_FROZEN: | 5587 | case CPU_DEAD_FROZEN: |
5572 | safe_put_page(percpu->spare_page); | 5588 | free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); |
5573 | kfree(percpu->scribble); | ||
5574 | percpu->spare_page = NULL; | ||
5575 | percpu->scribble = NULL; | ||
5576 | break; | 5589 | break; |
5577 | default: | 5590 | default: |
5578 | break; | 5591 | break; |
@@ -5584,40 +5597,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
5584 | static int raid5_alloc_percpu(struct r5conf *conf) | 5597 | static int raid5_alloc_percpu(struct r5conf *conf) |
5585 | { | 5598 | { |
5586 | unsigned long cpu; | 5599 | unsigned long cpu; |
5587 | struct page *spare_page; | 5600 | int err = 0; |
5588 | struct raid5_percpu __percpu *allcpus; | ||
5589 | void *scribble; | ||
5590 | int err; | ||
5591 | 5601 | ||
5592 | allcpus = alloc_percpu(struct raid5_percpu); | 5602 | conf->percpu = alloc_percpu(struct raid5_percpu); |
5593 | if (!allcpus) | 5603 | if (!conf->percpu) |
5594 | return -ENOMEM; | 5604 | return -ENOMEM; |
5595 | conf->percpu = allcpus; | 5605 | |
5606 | #ifdef CONFIG_HOTPLUG_CPU | ||
5607 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
5608 | conf->cpu_notify.priority = 0; | ||
5609 | err = register_cpu_notifier(&conf->cpu_notify); | ||
5610 | if (err) | ||
5611 | return err; | ||
5612 | #endif | ||
5596 | 5613 | ||
5597 | get_online_cpus(); | 5614 | get_online_cpus(); |
5598 | err = 0; | ||
5599 | for_each_present_cpu(cpu) { | 5615 | for_each_present_cpu(cpu) { |
5600 | if (conf->level == 6) { | 5616 | err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); |
5601 | spare_page = alloc_page(GFP_KERNEL); | 5617 | if (err) { |
5602 | if (!spare_page) { | 5618 | pr_err("%s: failed memory allocation for cpu%ld\n", |
5603 | err = -ENOMEM; | 5619 | __func__, cpu); |
5604 | break; | ||
5605 | } | ||
5606 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | ||
5607 | } | ||
5608 | scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
5609 | if (!scribble) { | ||
5610 | err = -ENOMEM; | ||
5611 | break; | 5620 | break; |
5612 | } | 5621 | } |
5613 | per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; | ||
5614 | } | 5622 | } |
5615 | #ifdef CONFIG_HOTPLUG_CPU | ||
5616 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
5617 | conf->cpu_notify.priority = 0; | ||
5618 | if (err == 0) | ||
5619 | err = register_cpu_notifier(&conf->cpu_notify); | ||
5620 | #endif | ||
5621 | put_online_cpus(); | 5623 | put_online_cpus(); |
5622 | 5624 | ||
5623 | return err; | 5625 | return err; |
@@ -6099,6 +6101,7 @@ static int run(struct mddev *mddev) | |||
6099 | blk_queue_io_min(mddev->queue, chunk_size); | 6101 | blk_queue_io_min(mddev->queue, chunk_size); |
6100 | blk_queue_io_opt(mddev->queue, chunk_size * | 6102 | blk_queue_io_opt(mddev->queue, chunk_size * |
6101 | (conf->raid_disks - conf->max_degraded)); | 6103 | (conf->raid_disks - conf->max_degraded)); |
6104 | mddev->queue->limits.raid_partial_stripes_expensive = 1; | ||
6102 | /* | 6105 | /* |
6103 | * We can only discard a whole stripe. It doesn't make sense to | 6106 | * We can only discard a whole stripe. It doesn't make sense to |
6104 | * discard data disk but write parity disk | 6107 | * discard data disk but write parity disk |
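run() now also sets limits.raid_partial_stripes_expensive on the array's queue next to the io_min/io_opt hints. How a stacked consumer uses the flag is outside this hunk, so the fragment below is only a hypothetical reader, assuming kernel context (<linux/blkdev.h>); prefer_full_stripe_writes() is an invented name, while queue_io_min()/queue_io_opt() are existing block-layer helpers:

#include <linux/blkdev.h>

/* Hypothetical consumer sketch: a stacked driver could choose to batch
 * writes up to the optimal I/O size (one full stripe) whenever the
 * underlying RAID queue marks partial-stripe writes as expensive. */
static bool prefer_full_stripe_writes(struct request_queue *q)
{
	return q->limits.raid_partial_stripes_expensive &&
	       queue_io_opt(q) > queue_io_min(q);
}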