 block/bfq-iosched.c           | 107
 block/blk-core.c              |  11
 block/blk-wbt.c               |  10
 drivers/md/bcache/alloc.c     |   4
 drivers/md/bcache/bcache.h    |   9
 drivers/md/bcache/btree.c     |   9
 drivers/md/bcache/journal.c   |  52
 drivers/md/bcache/super.c     |  25
 drivers/md/bcache/sysfs.c     |  34
 drivers/md/bcache/util.h      |   2
 drivers/md/bcache/writeback.c |   9
 drivers/md/bcache/writeback.h |   3
 12 files changed, 212 insertions(+), 63 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 47e6ec7427c4..aeca22d91101 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
         }
 
         /*
-         * We exploit the bfq_finish_request hook to decrement
-         * rq_in_driver, but bfq_finish_request will not be
-         * invoked on this request. So, to avoid unbalance,
-         * just start this request, without incrementing
-         * rq_in_driver. As a negative consequence,
-         * rq_in_driver is deceptively lower than it should be
-         * while this request is in service. This may cause
-         * bfq_schedule_dispatch to be invoked uselessly.
+         * We exploit the bfq_finish_requeue_request hook to
+         * decrement rq_in_driver, but
+         * bfq_finish_requeue_request will not be invoked on
+         * this request. So, to avoid unbalance, just start
+         * this request, without incrementing rq_in_driver. As
+         * a negative consequence, rq_in_driver is deceptively
+         * lower than it should be while this request is in
+         * service. This may cause bfq_schedule_dispatch to be
+         * invoked uselessly.
          *
          * As for implementing an exact solution, the
-         * bfq_finish_request hook, if defined, is probably
-         * invoked also on this request. So, by exploiting
-         * this hook, we could 1) increment rq_in_driver here,
-         * and 2) decrement it in bfq_finish_request. Such a
-         * solution would let the value of the counter be
-         * always accurate, but it would entail using an extra
-         * interface function. This cost seems higher than the
-         * benefit, being the frequency of non-elevator-private
+         * bfq_finish_requeue_request hook, if defined, is
+         * probably invoked also on this request. So, by
+         * exploiting this hook, we could 1) increment
+         * rq_in_driver here, and 2) decrement it in
+         * bfq_finish_requeue_request. Such a solution would
+         * let the value of the counter be always accurate,
+         * but it would entail using an extra interface
+         * function. This cost seems higher than the benefit,
+         * being the frequency of non-elevator-private
          * requests very low.
          */
         goto start_rq;
@@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
                        unsigned int cmd_flags) {}
 #endif
 
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                    bool at_head)
 {
@@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
         else
             list_add_tail(&rq->queuelist, &bfqd->dispatch);
     } else {
+        if (WARN_ON_ONCE(!bfqq)) {
+            /*
+             * This should never happen. Most likely rq is
+             * a requeued regular request, being
+             * re-inserted without being first
+             * re-prepared. Do a prepare, to avoid
+             * failure.
+             */
+            bfq_prepare_request(rq, rq->bio);
+            bfqq = RQ_BFQQ(rq);
+        }
+
         idle_timer_disabled = __bfq_insert_request(bfqd, rq);
         /*
          * Update bfqq, because, if a queue merge has occurred
@@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
         bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_finish_request_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
     bfqq->allocated--;
 
     bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-    struct bfq_queue *bfqq;
+    struct bfq_queue *bfqq = RQ_BFQQ(rq);
     struct bfq_data *bfqd;
 
-    if (!rq->elv.icq)
+    /*
+     * Requeue and finish hooks are invoked in blk-mq without
+     * checking whether the involved request is actually still
+     * referenced in the scheduler. To handle this fact, the
+     * following two checks make this function exit in case of
+     * spurious invocations, for which there is nothing to do.
+     *
+     * First, check whether rq has nothing to do with an elevator.
+     */
+    if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+        return;
+
+    /*
+     * rq either is not associated with any icq, or is an already
+     * requeued request that has not (yet) been re-inserted into
+     * a bfq_queue.
+     */
+    if (!rq->elv.icq || !bfqq)
         return;
 
-    bfqq = RQ_BFQQ(rq);
     bfqd = bfqq->bfqd;
 
     if (rq->rq_flags & RQF_STARTED)
@@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
         spin_lock_irqsave(&bfqd->lock, flags);
 
         bfq_completed_request(bfqq, bfqd);
-        bfq_finish_request_body(bfqq);
+        bfq_finish_requeue_request_body(bfqq);
 
         spin_unlock_irqrestore(&bfqd->lock, flags);
     } else {
         /*
          * Request rq may be still/already in the scheduler,
-         * in which case we need to remove it. And we cannot
+         * in which case we need to remove it (this should
+         * never happen in case of requeue). And we cannot
          * defer such a check and removal, to avoid
          * inconsistencies in the time interval from the end
          * of this function to the start of the deferred work.
@@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
             bfqg_stats_update_io_remove(bfqq_group(bfqq),
                             rq->cmd_flags);
         }
-        bfq_finish_request_body(bfqq);
+        bfq_finish_requeue_request_body(bfqq);
     }
 
+    /*
+     * Reset private fields. In case of a requeue, this allows
+     * this function to correctly do nothing if it is spuriously
+     * invoked again on this same request (see the check at the
+     * beginning of the function). Probably, a better general
+     * design would be to prevent blk-mq from invoking the requeue
+     * or finish hooks of an elevator, for a request that is not
+     * referred by that elevator.
+     *
+     * Resetting the following fields would break the
+     * request-insertion logic if rq is re-inserted into a bfq
+     * internal queue, without a re-preparation. Here we assume
+     * that re-insertions of requeued requests, without
+     * re-preparation, can happen only for pass_through or at_head
+     * requests (which are not re-inserted into bfq internal
+     * queues).
+     */
     rq->elv.priv[0] = NULL;
     rq->elv.priv[1] = NULL;
 }
@@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
     .ops.mq = {
         .limit_depth        = bfq_limit_depth,
         .prepare_request    = bfq_prepare_request,
-        .finish_request     = bfq_finish_request,
+        .requeue_request    = bfq_finish_requeue_request,
+        .finish_request     = bfq_finish_requeue_request,
         .exit_icq           = bfq_exit_icq,
         .insert_requests    = bfq_insert_requests,
         .dispatch_request   = bfq_dispatch_request,
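The bfq hunks above hinge on making the combined finish/requeue hook safe to call more than once: blk-mq may invoke it for a request the scheduler no longer tracks, so the hook bails out early (RQF_ELVPRIV, NULL bfqq) and clears rq->elv.priv[] so that any later spurious call finds nothing to do. Below is a small userspace model of that idempotent-teardown idea; it is illustrative only, and the struct and field names are made up rather than taken from blk-mq.

    #include <stdio.h>

    /* Illustrative model of a request carrying scheduler-private state. */
    struct queue {
        int allocated;          /* stands in for bfqq->allocated */
    };

    struct request {
        struct queue *elv_priv; /* stands in for rq->elv.priv[0] */
    };

    /* Teardown hook: must tolerate a second, spurious invocation. */
    static void finish_or_requeue(struct request *rq)
    {
        struct queue *q = rq->elv_priv;

        if (!q)                 /* spurious call: nothing left to drop */
            return;

        q->allocated--;         /* drop the reference held for rq */
        rq->elv_priv = NULL;    /* make any further call a no-op */
    }

    int main(void)
    {
        struct queue q = { .allocated = 1 };
        struct request rq = { .elv_priv = &q };

        finish_or_requeue(&rq); /* real completion */
        finish_or_requeue(&rq); /* spurious requeue/finish: ignored */
        printf("allocated = %d\n", q.allocated);  /* prints 0, not -1 */
        return 0;
    }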
diff --git a/block/blk-core.c b/block/blk-core.c
index d0d104268f1a..2d1a7bbe0634 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
     return false;
 }
 
+static noinline int should_fail_bio(struct bio *bio)
+{
+    if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+        return -EIO;
+    return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
     if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
         goto not_supported;
 
-    if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+    if (should_fail_bio(bio))
         goto end_io;
 
     if (!bio->bi_partno) {
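should_fail_bio() is split out as a separate noinline function and tagged with ALLOW_ERROR_INJECTION so that BPF-based error injection has a stable symbol whose return value can be overridden, letting tests force an -EIO on bio submission without rebuilding the kernel. As a rough userspace analogue of the same idea (an illustrative model, not the kernel mechanism; every name below is invented), the submission path consults a small predicate that a test harness can flip into failing:

    #include <errno.h>
    #include <stdio.h>

    /*
     * Userspace model of a fault-injection point: the submission path asks
     * a tiny predicate whether to fail. In the kernel the predicate is
     * should_fail_bio(); BPF error injection plays the role of inject_every.
     */
    static int inject_every;   /* 0 = never fail; N = fail every Nth call */

    static int should_fail_io(void)
    {
        static int calls;

        if (inject_every && ++calls % inject_every == 0)
            return -EIO;
        return 0;
    }

    static int submit_io(int block)
    {
        int ret = should_fail_io();

        if (ret)
            return ret;         /* error path exercised without real I/O errors */
        printf("wrote block %d\n", block);
        return 0;
    }

    int main(void)
    {
        inject_every = 3;       /* fail every third submission */
        for (int i = 0; i < 6; i++)
            if (submit_io(i) < 0)
                fprintf(stderr, "block %d: injected EIO\n", i);
        return 0;
    }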
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ae8de9780085..f92fc84b5e2c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-    return rq_data_dir(rq);
+    const int op = req_op(rq);
+
+    if (op == REQ_OP_READ)
+        return READ;
+    else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+        return WRITE;
+
+    /* don't account */
+    return -1;
 }
 
 int wbt_init(struct request_queue *q)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6cc6c0f9c3a9..458e1d38577d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -287,8 +287,10 @@ do { \
             break; \
 \
         mutex_unlock(&(ca)->set->bucket_lock); \
-        if (kthread_should_stop()) \
+        if (kthread_should_stop()) { \
+            set_current_state(TASK_RUNNING); \
             return 0; \
+        } \
 \
         schedule(); \
         mutex_lock(&(ca)->set->bucket_lock); \
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5e2d4e80198e..12e5197f186c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -658,10 +658,15 @@ struct cache_set {
     atomic_long_t       writeback_keys_done;
     atomic_long_t       writeback_keys_failed;
 
+    atomic_long_t       reclaim;
+    atomic_long_t       flush_write;
+    atomic_long_t       retry_flush_write;
+
     enum {
         ON_ERROR_UNREGISTER,
         ON_ERROR_PANIC,
     }               on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
     unsigned        error_limit;
     unsigned        error_decay;
 
@@ -675,6 +680,8 @@
 
 #define BUCKET_HASH_BITS    12
     struct hlist_head   bucket_hash[1 << BUCKET_HASH_BITS];
+
+    DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
@@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *);
 
 int bch_flash_dev_create(struct cache_set *c, uint64_t size);
 
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
 void bch_cached_dev_detach(struct cached_dev *);
 void bch_cached_dev_run(struct cached_dev *);
 void bcache_device_stop(struct bcache_device *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index bf3a48aa9a9a..fad9fe8817eb 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
      */
     for_each_cache(ca, c, i) {
         for_each_bucket(b, ca) {
-            if (fifo_full(&ca->free[RESERVE_PRIO]))
+            if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+                fifo_full(&ca->free[RESERVE_BTREE]))
                 break;
 
             if (bch_can_invalidate_bucket(ca, b) &&
                 !GC_MARK(b)) {
                 __bch_invalidate_one_bucket(ca, b);
-                fifo_push(&ca->free[RESERVE_PRIO],
-                      b - ca->buckets);
+                if (!fifo_push(&ca->free[RESERVE_PRIO],
+                       b - ca->buckets))
+                    fifo_push(&ca->free[RESERVE_BTREE],
+                          b - ca->buckets);
             }
         }
     }
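The bch_initial_gc_finish() change keeps invalidating buckets until both the RESERVE_PRIO and RESERVE_BTREE fifos are full, spilling into the btree reserve whenever the prio reserve rejects a push. Below is a minimal userspace model of that push-with-fallback pattern; it is illustrative only and is not the bcache fifo implementation.

    #include <stdbool.h>
    #include <stdio.h>

    /* Tiny bounded fifo standing in for bcache's fifo_push()/fifo_full(). */
    #define FIFO_SIZE 4

    struct fifo {
        int data[FIFO_SIZE];
        int used;
    };

    static bool fifo_full(const struct fifo *f)
    {
        return f->used == FIFO_SIZE;
    }

    static bool fifo_push(struct fifo *f, int v)
    {
        if (fifo_full(f))
            return false;       /* caller decides where to spill */
        f->data[f->used++] = v;
        return true;
    }

    int main(void)
    {
        struct fifo prio = { .used = 0 }, btree = { .used = 0 };

        for (int bucket = 0; bucket < 10; bucket++) {
            if (fifo_full(&prio) && fifo_full(&btree))
                break;                        /* both reserves are filled */
            if (!fifo_push(&prio, bucket))    /* primary reserve first */
                fifo_push(&btree, bucket);    /* spill into the btree reserve */
        }
        printf("prio=%d btree=%d buckets reserved\n", prio.used, btree.used);
        return 0;
    }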
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index a87165c1d8e5..1b736b860739 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -368,6 +368,12 @@ err:
 }
 
 /* Journalling */
+#define journal_max_cmp(l, r) \
+    (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+     fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+    (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+     fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
@@ -375,28 +381,41 @@ static void btree_flush_write(struct cache_set *c)
      * Try to find the btree node with that references the oldest journal
      * entry, best is our current candidate and is locked if non NULL:
      */
-    struct btree *b, *best;
-    unsigned i;
+    struct btree *b;
+    int i;
+
+    atomic_long_inc(&c->flush_write);
+
 retry:
-    best = NULL;
-
-    for_each_cached_btree(b, c, i)
-        if (btree_current_write(b)->journal) {
-            if (!best)
-                best = b;
-            else if (journal_pin_cmp(c,
-                    btree_current_write(best)->journal,
-                    btree_current_write(b)->journal)) {
-                best = b;
+    spin_lock(&c->journal.lock);
+    if (heap_empty(&c->flush_btree)) {
+        for_each_cached_btree(b, c, i)
+            if (btree_current_write(b)->journal) {
+                if (!heap_full(&c->flush_btree))
+                    heap_add(&c->flush_btree, b,
+                         journal_max_cmp);
+                else if (journal_max_cmp(b,
+                     heap_peek(&c->flush_btree))) {
+                    c->flush_btree.data[0] = b;
+                    heap_sift(&c->flush_btree, 0,
+                          journal_max_cmp);
+                }
             }
-        }
 
-    b = best;
+        for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+            heap_sift(&c->flush_btree, i, journal_min_cmp);
+    }
+
+    b = NULL;
+    heap_pop(&c->flush_btree, b, journal_min_cmp);
+    spin_unlock(&c->journal.lock);
+
     if (b) {
         mutex_lock(&b->write_lock);
         if (!btree_current_write(b)->journal) {
             mutex_unlock(&b->write_lock);
             /* We raced */
+            atomic_long_inc(&c->retry_flush_write);
             goto retry;
         }
 
@@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
     unsigned iter, n = 0;
     atomic_t p;
 
+    atomic_long_inc(&c->reclaim);
+
     while (!atomic_read(&fifo_front(&c->journal.pin)))
         fifo_pop(&c->journal.pin, p);
 
@@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
     j->w[0].c = c;
     j->w[1].c = c;
 
-    if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+    if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+        !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
         !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
         !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
         return -ENOMEM;
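The btree_flush_write() rework replaces a full rescan of every cached btree node on each call with a bounded candidate heap: while collecting, a max-heap ordered by journal_max_cmp keeps only the nodes pinning the oldest journal entries, and the heap is then re-sifted with journal_min_cmp so subsequent pops hand back the oldest candidate first. Below is a standalone sketch of that two-phase selection using plain arrays instead of the bcache heap macros; the bound K and the sample journal indexes are made up for illustration.

    #include <stdio.h>

    #define K 4                     /* bounded candidate set, like init_heap(..., 128, ...) */

    static int heap[K], used;

    /* Sift element i down, with cmp(parent, child) as the heap property. */
    static void sift(int i, int (*cmp)(int, int))
    {
        for (;;) {
            int child = 2 * i + 1;

            if (child >= used)
                break;
            if (child + 1 < used && cmp(heap[child], heap[child + 1]))
                child++;
            if (!cmp(heap[i], heap[child]))
                break;
            int tmp = heap[i]; heap[i] = heap[child]; heap[child] = tmp;
            i = child;
        }
    }

    static int max_cmp(int a, int b) { return a < b; }  /* largest index on top */
    static int min_cmp(int a, int b) { return a > b; }  /* smallest index on top */

    int main(void)
    {
        int journal_idx[] = { 7, 3, 9, 1, 8, 2, 6 };    /* per-node pin indexes */

        /* Phase 1: keep the K oldest (smallest index) via a max-heap. */
        for (unsigned n = 0; n < sizeof(journal_idx) / sizeof(*journal_idx); n++) {
            if (used < K) {
                heap[used++] = journal_idx[n];
                for (int i = used / 2 - 1; i >= 0; --i)
                    sift(i, max_cmp);
            } else if (journal_idx[n] < heap[0]) {
                heap[0] = journal_idx[n];       /* evict the youngest candidate */
                sift(0, max_cmp);
            }
        }

        /* Phase 2: re-heapify as a min-heap and pop oldest-first. */
        for (int i = used / 2 - 1; i >= 0; --i)
            sift(i, min_cmp);
        while (used) {
            printf("flush node pinning journal entry %d\n", heap[0]);
            heap[0] = heap[--used];
            sift(0, min_cmp);
        }
        return 0;
    }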
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 133b81225ea9..312895788036 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -957,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc)
     cached_dev_put(dc);
 }
 
-int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
+              uint8_t *set_uuid)
 {
     uint32_t rtime = cpu_to_le32(get_seconds());
     struct uuid_entry *u;
@@ -965,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
     bdevname(dc->bdev, buf);
 
-    if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+    if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
+        (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
         return -ENOENT;
 
     if (dc->disk.c) {
@@ -1194,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 
     list_add(&dc->list, &uncached_devices);
     list_for_each_entry(c, &bch_cache_sets, list)
-        bch_cached_dev_attach(dc, c);
+        bch_cached_dev_attach(dc, c, NULL);
 
     if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
         BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
@@ -1553,7 +1555,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
     c->congested_read_threshold_us  = 2000;
     c->congested_write_threshold_us = 20000;
-    c->error_limit  = 8 << IO_ERROR_SHIFT;
+    c->error_limit  = DEFAULT_IO_ERROR_LIMIT;
 
     return c;
 err:
@@ -1716,7 +1718,7 @@ static void run_cache_set(struct cache_set *c)
     bcache_write_super(c);
 
     list_for_each_entry_safe(dc, t, &uncached_devices, list)
-        bch_cached_dev_attach(dc, c);
+        bch_cached_dev_attach(dc, c, NULL);
 
     flash_devs_run(c);
 
@@ -1833,6 +1835,7 @@ void bch_cache_release(struct kobject *kobj)
 static int cache_alloc(struct cache *ca)
 {
     size_t free;
+    size_t btree_buckets;
     struct bucket *b;
 
     __module_get(THIS_MODULE);
@@ -1840,9 +1843,19 @@ static int cache_alloc(struct cache *ca)
 
     bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
 
+    /*
+     * when ca->sb.njournal_buckets is not zero, journal exists,
+     * and in bch_journal_replay(), tree node may split,
+     * so bucket of RESERVE_BTREE type is needed,
+     * the worst situation is all journal buckets are valid journal,
+     * and all the keys need to replay,
+     * so the number of RESERVE_BTREE type buckets should be as much
+     * as journal buckets
+     */
+    btree_buckets = ca->sb.njournal_buckets ?: 8;
     free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-    if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+    if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
         !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
         !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
         !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b4184092c727..78cd7bd50fdd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -65,6 +65,9 @@ read_attribute(bset_tree_stats);
 
 read_attribute(state);
 read_attribute(cache_read_races);
+read_attribute(reclaim);
+read_attribute(flush_write);
+read_attribute(retry_flush_write);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
@@ -195,7 +198,7 @@ STORE(__cached_dev)
 {
     struct cached_dev *dc = container_of(kobj, struct cached_dev,
                          disk.kobj);
-    ssize_t v = size;
+    ssize_t v;
     struct cache_set *c;
     struct kobj_uevent_env *env;
 
@@ -215,7 +218,9 @@ STORE(__cached_dev)
     sysfs_strtoul_clamp(writeback_rate,
                 dc->writeback_rate.rate, 1, INT_MAX);
 
-    d_strtoul_nonzero(writeback_rate_update_seconds);
+    sysfs_strtoul_clamp(writeback_rate_update_seconds,
+                dc->writeback_rate_update_seconds,
+                1, WRITEBACK_RATE_UPDATE_SECS_MAX);
     d_strtoul(writeback_rate_i_term_inverse);
     d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
@@ -267,17 +272,20 @@ STORE(__cached_dev)
     }
 
     if (attr == &sysfs_attach) {
-        if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+        uint8_t set_uuid[16];
+
+        if (bch_parse_uuid(buf, set_uuid) < 16)
             return -EINVAL;
 
+        v = -ENOENT;
         list_for_each_entry(c, &bch_cache_sets, list) {
-            v = bch_cached_dev_attach(dc, c);
+            v = bch_cached_dev_attach(dc, c, set_uuid);
             if (!v)
                 return size;
         }
 
         pr_err("Can't attach %s: cache set not found", buf);
-        size = v;
+        return v;
     }
 
     if (attr == &sysfs_detach && dc->disk.c)
@@ -545,6 +553,15 @@ SHOW(__bch_cache_set)
     sysfs_print(cache_read_races,
             atomic_long_read(&c->cache_read_races));
 
+    sysfs_print(reclaim,
+            atomic_long_read(&c->reclaim));
+
+    sysfs_print(flush_write,
+            atomic_long_read(&c->flush_write));
+
+    sysfs_print(retry_flush_write,
+            atomic_long_read(&c->retry_flush_write));
+
     sysfs_print(writeback_keys_done,
             atomic_long_read(&c->writeback_keys_done));
     sysfs_print(writeback_keys_failed,
@@ -556,7 +573,7 @@ SHOW(__bch_cache_set)
 
     /* See count_io_errors for why 88 */
     sysfs_print(io_error_halflife,  c->error_decay * 88);
-    sysfs_print(io_error_limit,     c->error_limit >> IO_ERROR_SHIFT);
+    sysfs_print(io_error_limit,     c->error_limit);
 
     sysfs_hprint(congested,
              ((uint64_t) bch_get_congested(c)) << 9);
@@ -656,7 +673,7 @@ STORE(__bch_cache_set)
     }
 
     if (attr == &sysfs_io_error_limit)
-        c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+        c->error_limit = strtoul_or_return(buf);
 
     /* See count_io_errors() for why 88 */
     if (attr == &sysfs_io_error_halflife)
@@ -731,6 +748,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
 
     &sysfs_bset_tree_stats,
     &sysfs_cache_read_races,
+    &sysfs_reclaim,
+    &sysfs_flush_write,
+    &sysfs_retry_flush_write,
     &sysfs_writeback_keys_done,
     &sysfs_writeback_keys_failed,
 
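The sysfs changes above replace d_strtoul_nonzero() for writeback_rate_update_seconds with a clamped store, so any written value is forced into [1, WRITEBACK_RATE_UPDATE_SECS_MAX] instead of being accepted verbatim. The following userspace sketch models that parse-validate-clamp pattern; it is illustrative of the idea behind sysfs_strtoul_clamp, not the macro itself, and the 1..60 bounds simply mirror the new writeback.h constants.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Userspace model of a clamped store: parse, reject garbage, clamp to [lo, hi]. */
    static int store_clamped(const char *buf, unsigned *out, unsigned lo, unsigned hi)
    {
        char *end;
        unsigned long v = strtoul(buf, &end, 10);

        if (end == buf)
            return -EINVAL;     /* not a number at all */
        if (v < lo)
            v = lo;
        if (v > hi)
            v = hi;
        *out = (unsigned)v;
        return 0;
    }

    int main(void)
    {
        const char *inputs[] = { "0", "5", "86400" };   /* values written via sysfs */
        unsigned secs;

        for (int i = 0; i < 3; i++)
            if (!store_clamped(inputs[i], &secs, 1, 60))
                printf("\"%s\" -> %u seconds\n", inputs[i], secs);
        return 0;
    }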
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 4df4c5c1cab2..a6763db7f061 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -112,6 +112,8 @@ do { \
 
 #define heap_full(h)    ((h)->used == (h)->size)
 
+#define heap_empty(h)   ((h)->used == 0)
+
 #define DECLARE_FIFO(type, name) \
     struct { \
         size_t front, back, size, mask; \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 51306a19ab03..f1d2fc15abcc 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg)
 
     while (!kthread_should_stop()) {
         down_write(&dc->writeback_lock);
+        set_current_state(TASK_INTERRUPTIBLE);
         if (!atomic_read(&dc->has_dirty) ||
             (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
              !dc->writeback_running)) {
             up_write(&dc->writeback_lock);
-            set_current_state(TASK_INTERRUPTIBLE);
 
-            if (kthread_should_stop())
+            if (kthread_should_stop()) {
+                set_current_state(TASK_RUNNING);
                 return 0;
+            }
 
             schedule();
             continue;
         }
+        set_current_state(TASK_RUNNING);
 
         searched_full_index = refill_dirty(dc);
 
@@ -652,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
     dc->writeback_rate.rate     = 1024;
     dc->writeback_rate_minimum  = 8;
 
-    dc->writeback_rate_update_seconds = 5;
+    dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
     dc->writeback_rate_p_term_inverse = 40;
     dc->writeback_rate_i_term_inverse = 10000;
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 66f1c527fa24..587b25599856 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -8,6 +8,9 @@
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000    /* *512b */
 
+#define WRITEBACK_RATE_UPDATE_SECS_MAX      60
+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT  5
+
 /*
  * 14 (16384ths) is chosen here as something that each backing device
  * should be a reasonable fraction of the share, and not to blow up
