author		Linus Torvalds <torvalds@linux-foundation.org>	2018-02-10 17:05:11 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-02-10 17:05:11 -0500
commit		9454473c9dccb7b9d25e5baf915a082bfd490b33
tree		46f7f1a8886088e2f0184f1cf0e47c8ac12d4849
parent		cc5cb5af3a3363bc6f0530703895bf9c5fa2f159
parent		8525e5ff456592effe83640ea1702525e35b0363
Merge tag 'for-linus-20180210' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
"A few fixes to round off the merge window on the block side:
- a set of bcache fixes by way of Michael Lyle, from the usual bcache
suspects.
- add a simple-to-hook-into function for bpf EIO error injection.
- fix blk-wbt, which mischaracterized flushes as reads. Improve the logic
so that flushes and writes are accounted as writes, and only reads
as reads. From me.
- fix requeue crash in BFQ, from Paolo"
* tag 'for-linus-20180210' of git://git.kernel.dk/linux-block:
block, bfq: add requeue-request hook
bcache: fix for data collapse after re-attaching an attached device
bcache: return attach error when no cache set exist
bcache: set writeback_rate_update_seconds in range [1, 60] seconds
bcache: fix for allocator and register thread race
bcache: set error_limit correctly
bcache: properly set task state in bch_writeback_thread()
bcache: fix high CPU occupancy during journal
bcache: add journal statistic
block: Add should_fail_bio() for bpf error injection
blk-wbt: account flush requests correctly
-rw-r--r--	block/bfq-iosched.c		107
-rw-r--r--	block/blk-core.c		 11
-rw-r--r--	block/blk-wbt.c			 10
-rw-r--r--	drivers/md/bcache/alloc.c	  4
-rw-r--r--	drivers/md/bcache/bcache.h	  9
-rw-r--r--	drivers/md/bcache/btree.c	  9
-rw-r--r--	drivers/md/bcache/journal.c	 52
-rw-r--r--	drivers/md/bcache/super.c	 25
-rw-r--r--	drivers/md/bcache/sysfs.c	 34
-rw-r--r--	drivers/md/bcache/util.h	  2
-rw-r--r--	drivers/md/bcache/writeback.c	  9
-rw-r--r--	drivers/md/bcache/writeback.h	  3
12 files changed, 212 insertions, 63 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 47e6ec7427c4..aeca22d91101 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		}
 
 		/*
-		 * We exploit the bfq_finish_request hook to decrement
-		 * rq_in_driver, but bfq_finish_request will not be
-		 * invoked on this request. So, to avoid unbalance,
-		 * just start this request, without incrementing
-		 * rq_in_driver. As a negative consequence,
-		 * rq_in_driver is deceptively lower than it should be
-		 * while this request is in service. This may cause
-		 * bfq_schedule_dispatch to be invoked uselessly.
+		 * We exploit the bfq_finish_requeue_request hook to
+		 * decrement rq_in_driver, but
+		 * bfq_finish_requeue_request will not be invoked on
+		 * this request. So, to avoid unbalance, just start
+		 * this request, without incrementing rq_in_driver. As
+		 * a negative consequence, rq_in_driver is deceptively
+		 * lower than it should be while this request is in
+		 * service. This may cause bfq_schedule_dispatch to be
+		 * invoked uselessly.
 		 *
 		 * As for implementing an exact solution, the
-		 * bfq_finish_request hook, if defined, is probably
-		 * invoked also on this request. So, by exploiting
-		 * this hook, we could 1) increment rq_in_driver here,
-		 * and 2) decrement it in bfq_finish_request. Such a
-		 * solution would let the value of the counter be
-		 * always accurate, but it would entail using an extra
-		 * interface function. This cost seems higher than the
-		 * benefit, being the frequency of non-elevator-private
+		 * bfq_finish_requeue_request hook, if defined, is
+		 * probably invoked also on this request. So, by
+		 * exploiting this hook, we could 1) increment
+		 * rq_in_driver here, and 2) decrement it in
+		 * bfq_finish_requeue_request. Such a solution would
+		 * let the value of the counter be always accurate,
+		 * but it would entail using an extra interface
+		 * function. This cost seems higher than the benefit,
+		 * being the frequency of non-elevator-private
 		 * requests very low.
 		 */
 		goto start_rq;
@@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 					   unsigned int cmd_flags) {}
 #endif
 
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
@@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
 	} else {
+		if (WARN_ON_ONCE(!bfqq)) {
+			/*
+			 * This should never happen. Most likely rq is
+			 * a requeued regular request, being
+			 * re-inserted without being first
+			 * re-prepared. Do a prepare, to avoid
+			 * failure.
+			 */
+			bfq_prepare_request(rq, rq->bio);
+			bfqq = RQ_BFQQ(rq);
+		}
+
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_finish_request_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
 	bfqq->allocated--;
 
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-	struct bfq_queue *bfqq;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd;
 
-	if (!rq->elv.icq)
+	/*
+	 * Requeue and finish hooks are invoked in blk-mq without
+	 * checking whether the involved request is actually still
+	 * referenced in the scheduler. To handle this fact, the
+	 * following two checks make this function exit in case of
+	 * spurious invocations, for which there is nothing to do.
+	 *
+	 * First, check whether rq has nothing to do with an elevator.
+	 */
+	if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+		return;
+
+	/*
+	 * rq either is not associated with any icq, or is an already
+	 * requeued request that has not (yet) been re-inserted into
+	 * a bfq_queue.
+	 */
+	if (!rq->elv.icq || !bfqq)
 		return;
 
-	bfqq = RQ_BFQQ(rq);
 	bfqd = bfqq->bfqd;
 
 	if (rq->rq_flags & RQF_STARTED)
@@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
 		spin_lock_irqsave(&bfqd->lock, flags);
 
 		bfq_completed_request(bfqq, bfqd);
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
 		/*
 		 * Request rq may be still/already in the scheduler,
-		 * in which case we need to remove it. And we cannot
+		 * in which case we need to remove it (this should
+		 * never happen in case of requeue). And we cannot
 		 * defer such a check and removal, to avoid
 		 * inconsistencies in the time interval from the end
 		 * of this function to the start of the deferred work.
@@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
 			bfqg_stats_update_io_remove(bfqq_group(bfqq),
 						    rq->cmd_flags);
 		}
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 	}
 
+	/*
+	 * Reset private fields. In case of a requeue, this allows
+	 * this function to correctly do nothing if it is spuriously
+	 * invoked again on this same request (see the check at the
+	 * beginning of the function). Probably, a better general
+	 * design would be to prevent blk-mq from invoking the requeue
+	 * or finish hooks of an elevator, for a request that is not
+	 * referred by that elevator.
+	 *
+	 * Resetting the following fields would break the
+	 * request-insertion logic if rq is re-inserted into a bfq
+	 * internal queue, without a re-preparation. Here we assume
+	 * that re-insertions of requeued requests, without
+	 * re-preparation, can happen only for pass_through or at_head
+	 * requests (which are not re-inserted into bfq internal
+	 * queues).
+	 */
 	rq->elv.priv[0] = NULL;
 	rq->elv.priv[1] = NULL;
 }
@@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
 		.limit_depth = bfq_limit_depth,
 		.prepare_request = bfq_prepare_request,
-		.finish_request = bfq_finish_request,
+		.requeue_request = bfq_finish_requeue_request,
+		.finish_request = bfq_finish_requeue_request,
 		.exit_icq = bfq_exit_icq,
 		.insert_requests = bfq_insert_requests,
 		.dispatch_request = bfq_dispatch_request,
diff --git a/block/blk-core.c b/block/blk-core.c
index d0d104268f1a..2d1a7bbe0634 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
 	return false;
 }
 
+static noinline int should_fail_bio(struct bio *bio)
+{
+	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+		return -EIO;
+	return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
 	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 		goto not_supported;
 
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_bio(bio))
 		goto end_io;
 
 	if (!bio->bi_partno) {
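The ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO) annotation above is what exposes the new helper to bpf-based error injection. As a rough illustration only (not part of this pull request), a kprobe-type BPF program could force the hook to return -EIO via bpf_override_return(); the program below is a hypothetical sketch and assumes a kernel built with CONFIG_FUNCTION_ERROR_INJECTION and CONFIG_BPF_KPROBE_OVERRIDE, plus a loader such as libbpf or bcc.

/* Hypothetical sketch: make should_fail_bio() fail with -EIO via bpf. */
#include <linux/ptrace.h>
#include <linux/errno.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("kprobe/should_fail_bio")
int inject_bio_eio(struct pt_regs *ctx)
{
	/*
	 * Override the probed function's return value. This is only
	 * permitted because should_fail_bio() is whitelisted with
	 * ALLOW_ERROR_INJECTION() in the hunk above.
	 */
	bpf_override_return(ctx, -EIO);
	return 0;
}

char _license[] SEC("license") = "GPL";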
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ae8de9780085..f92fc84b5e2c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	return rq_data_dir(rq);
+	const int op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+		return WRITE;
+
+	/* don't account */
+	return -1;
 }
 
 int wbt_init(struct request_queue *q)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6cc6c0f9c3a9..458e1d38577d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -287,8 +287,10 @@ do {	\
 			break;	\
 	\
 		mutex_unlock(&(ca)->set->bucket_lock);	\
-		if (kthread_should_stop())	\
+		if (kthread_should_stop()) {	\
+			set_current_state(TASK_RUNNING);	\
 			return 0;	\
+		}	\
 	\
 		schedule();	\
 		mutex_lock(&(ca)->set->bucket_lock);	\
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5e2d4e80198e..12e5197f186c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -658,10 +658,15 @@ struct cache_set {
 	atomic_long_t		writeback_keys_done;
 	atomic_long_t		writeback_keys_failed;
 
+	atomic_long_t		reclaim;
+	atomic_long_t		flush_write;
+	atomic_long_t		retry_flush_write;
+
 	enum			{
 		ON_ERROR_UNREGISTER,
 		ON_ERROR_PANIC,
 	}			on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
 	unsigned		error_limit;
 	unsigned		error_decay;
 
@@ -675,6 +680,8 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+
+	DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
@@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *);
 
 int bch_flash_dev_create(struct cache_set *c, uint64_t size);
 
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
 void bch_cached_dev_detach(struct cached_dev *);
 void bch_cached_dev_run(struct cached_dev *);
 void bcache_device_stop(struct bcache_device *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index bf3a48aa9a9a..fad9fe8817eb 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
 	 */
 	for_each_cache(ca, c, i) {
 		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]))
+			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+			    fifo_full(&ca->free[RESERVE_BTREE]))
 				break;
 
 			if (bch_can_invalidate_bucket(ca, b) &&
 			    !GC_MARK(b)) {
 				__bch_invalidate_one_bucket(ca, b);
-				fifo_push(&ca->free[RESERVE_PRIO],
-					  b - ca->buckets);
+				if (!fifo_push(&ca->free[RESERVE_PRIO],
+				   b - ca->buckets))
+					fifo_push(&ca->free[RESERVE_BTREE],
+						  b - ca->buckets);
 			}
 		}
 	}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index a87165c1d8e5..1b736b860739 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -368,6 +368,12 @@ err:
 }
 
 /* Journalling */
+#define journal_max_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
@@ -375,28 +381,41 @@ static void btree_flush_write(struct cache_set *c)
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b, *best;
-	unsigned i;
+	struct btree *b;
+	int i;
+
+	atomic_long_inc(&c->flush_write);
+
 retry:
-	best = NULL;
-
-	for_each_cached_btree(b, c, i)
-		if (btree_current_write(b)->journal) {
-			if (!best)
-				best = b;
-			else if (journal_pin_cmp(c,
-					btree_current_write(best)->journal,
-					btree_current_write(b)->journal)) {
-				best = b;
-			}
-		}
-
-	b = best;
+	spin_lock(&c->journal.lock);
+	if (heap_empty(&c->flush_btree)) {
+		for_each_cached_btree(b, c, i)
+			if (btree_current_write(b)->journal) {
+				if (!heap_full(&c->flush_btree))
+					heap_add(&c->flush_btree, b,
+						 journal_max_cmp);
+				else if (journal_max_cmp(b,
+					 heap_peek(&c->flush_btree))) {
+					c->flush_btree.data[0] = b;
+					heap_sift(&c->flush_btree, 0,
+						  journal_max_cmp);
+				}
+			}
+
+		for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+			heap_sift(&c->flush_btree, i, journal_min_cmp);
+	}
+
+	b = NULL;
+	heap_pop(&c->flush_btree, b, journal_min_cmp);
+	spin_unlock(&c->journal.lock);
+
 	if (b) {
 		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
 			mutex_unlock(&b->write_lock);
 			/* We raced */
+			atomic_long_inc(&c->retry_flush_write);
 			goto retry;
 		}
 
@@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
 	unsigned iter, n = 0;
 	atomic_t p;
 
+	atomic_long_inc(&c->reclaim);
+
 	while (!atomic_read(&fifo_front(&c->journal.pin)))
 		fifo_pop(&c->journal.pin, p);
 
@@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
 	j->w[0].c = c;
 	j->w[1].c = c;
 
-	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+	    !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
 	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
 	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
 		return -ENOMEM;
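The btree_flush_write() rework above bounds the per-reclaim scan: while walking the cached btree nodes it keeps up to 128 candidates (the size passed to init_heap() for c->flush_btree) in a max-heap ordered by journal index, so the node pinning the newest entry sits at the root and is evicted whenever a node pinning an older entry is found; the array is then re-heapified with journal_min_cmp so heap_pop returns nodes oldest-first. The stand-alone C sketch below shows that generic top-K-smallest pattern; it is illustrative only, with made-up names and plain int keys rather than bcache's heap macros and btree structures.

/* Illustrative top-K-smallest selection, mirroring the flush_btree logic. */
#include <stddef.h>
#include <stdbool.h>

#define K 128

struct topk {
	size_t used;
	int data[K];
};

/* Sift data[i] down; "before" says which of two keys belongs nearer the root. */
static void sift(struct topk *h, size_t i, bool (*before)(int, int))
{
	for (;;) {
		size_t l = 2 * i + 1, r = l + 1, m = i;

		if (l < h->used && before(h->data[l], h->data[m]))
			m = l;
		if (r < h->used && before(h->data[r], h->data[m]))
			m = r;
		if (m == i)
			return;
		int tmp = h->data[i];
		h->data[i] = h->data[m];
		h->data[m] = tmp;
		i = m;
	}
}

static bool max_first(int a, int b) { return a > b; }	/* like journal_max_cmp */
static bool min_first(int a, int b) { return a < b; }	/* like journal_min_cmp */

/* Keep the K smallest of keys[0..n), then arrange them to pop smallest-first. */
void topk_smallest(struct topk *h, const int *keys, size_t n)
{
	h->used = 0;
	for (size_t i = 0; i < n; i++) {
		if (h->used < K) {			/* heap not full: sift the key up */
			size_t c = h->used++;

			h->data[c] = keys[i];
			while (c && max_first(h->data[c], h->data[(c - 1) / 2])) {
				int tmp = h->data[c];

				h->data[c] = h->data[(c - 1) / 2];
				h->data[(c - 1) / 2] = tmp;
				c = (c - 1) / 2;
			}
		} else if (keys[i] < h->data[0]) {	/* beats the current maximum */
			h->data[0] = keys[i];
			sift(h, 0, max_first);
		}
	}
	/* Rebuild in place as a min-heap, as the journal_min_cmp pass does. */
	for (size_t i = h->used / 2; i-- > 0; )
		sift(h, i, min_first);
}

Draining is then the usual heap pop: take data[0], move the last element to the root, shrink used, and sift down with min_first, which is what the retry loop in btree_flush_write() consumes one node at a time.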
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 133b81225ea9..312895788036 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -957,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 	cached_dev_put(dc);
 }
 
-int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
+			  uint8_t *set_uuid)
 {
 	uint32_t rtime = cpu_to_le32(get_seconds());
 	struct uuid_entry *u;
@@ -965,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
 	bdevname(dc->bdev, buf);
 
-	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+	if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
+	    (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
 		return -ENOENT;
 
 	if (dc->disk.c) {
@@ -1194,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 
 	list_add(&dc->list, &uncached_devices);
 	list_for_each_entry(c, &bch_cache_sets, list)
-		bch_cached_dev_attach(dc, c);
+		bch_cached_dev_attach(dc, c, NULL);
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
 	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
@@ -1553,7 +1555,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	c->congested_read_threshold_us = 2000;
 	c->congested_write_threshold_us = 20000;
-	c->error_limit = 8 << IO_ERROR_SHIFT;
+	c->error_limit = DEFAULT_IO_ERROR_LIMIT;
 
 	return c;
 err:
@@ -1716,7 +1718,7 @@ static void run_cache_set(struct cache_set *c)
 		bcache_write_super(c);
 
 		list_for_each_entry_safe(dc, t, &uncached_devices, list)
-			bch_cached_dev_attach(dc, c);
+			bch_cached_dev_attach(dc, c, NULL);
 
 		flash_devs_run(c);
 
@@ -1833,6 +1835,7 @@ void bch_cache_release(struct kobject *kobj)
 static int cache_alloc(struct cache *ca)
 {
 	size_t free;
+	size_t btree_buckets;
 	struct bucket *b;
 
 	__module_get(THIS_MODULE);
@@ -1840,9 +1843,19 @@ static int cache_alloc(struct cache *ca)
 
 	bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
 
+	/*
+	 * when ca->sb.njournal_buckets is not zero, journal exists,
+	 * and in bch_journal_replay(), tree node may split,
+	 * so bucket of RESERVE_BTREE type is needed,
+	 * the worst situation is all journal buckets are valid journal,
+	 * and all the keys need to replay,
+	 * so the number of RESERVE_BTREE type buckets should be as much
+	 * as journal buckets
+	 */
+	btree_buckets = ca->sb.njournal_buckets ?: 8;
 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
 	    !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b4184092c727..78cd7bd50fdd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -65,6 +65,9 @@ read_attribute(bset_tree_stats);
 
 read_attribute(state);
 read_attribute(cache_read_races);
+read_attribute(reclaim);
+read_attribute(flush_write);
+read_attribute(retry_flush_write);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
@@ -195,7 +198,7 @@ STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
-	ssize_t v = size;
+	ssize_t v;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
 
@@ -215,7 +218,9 @@ STORE(__cached_dev)
 	sysfs_strtoul_clamp(writeback_rate,
 			    dc->writeback_rate.rate, 1, INT_MAX);
 
-	d_strtoul_nonzero(writeback_rate_update_seconds);
+	sysfs_strtoul_clamp(writeback_rate_update_seconds,
+			    dc->writeback_rate_update_seconds,
+			    1, WRITEBACK_RATE_UPDATE_SECS_MAX);
 	d_strtoul(writeback_rate_i_term_inverse);
 	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
@@ -267,17 +272,20 @@ STORE(__cached_dev)
 	}
 
 	if (attr == &sysfs_attach) {
-		if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+		uint8_t set_uuid[16];
+
+		if (bch_parse_uuid(buf, set_uuid) < 16)
 			return -EINVAL;
 
+		v = -ENOENT;
 		list_for_each_entry(c, &bch_cache_sets, list) {
-			v = bch_cached_dev_attach(dc, c);
+			v = bch_cached_dev_attach(dc, c, set_uuid);
 			if (!v)
 				return size;
 		}
 
 		pr_err("Can't attach %s: cache set not found", buf);
-		size = v;
+		return v;
 	}
 
 	if (attr == &sysfs_detach && dc->disk.c)
@@ -545,6 +553,15 @@ SHOW(__bch_cache_set)
 	sysfs_print(cache_read_races,
 		    atomic_long_read(&c->cache_read_races));
 
+	sysfs_print(reclaim,
+		    atomic_long_read(&c->reclaim));
+
+	sysfs_print(flush_write,
+		    atomic_long_read(&c->flush_write));
+
+	sysfs_print(retry_flush_write,
+		    atomic_long_read(&c->retry_flush_write));
+
 	sysfs_print(writeback_keys_done,
 		    atomic_long_read(&c->writeback_keys_done));
 	sysfs_print(writeback_keys_failed,
@@ -556,7 +573,7 @@ SHOW(__bch_cache_set)
 
 	/* See count_io_errors for why 88 */
 	sysfs_print(io_error_halflife, c->error_decay * 88);
-	sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
+	sysfs_print(io_error_limit, c->error_limit);
 
 	sysfs_hprint(congested,
 		     ((uint64_t) bch_get_congested(c)) << 9);
@@ -656,7 +673,7 @@ STORE(__bch_cache_set)
 	}
 
 	if (attr == &sysfs_io_error_limit)
-		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+		c->error_limit = strtoul_or_return(buf);
 
 	/* See count_io_errors() for why 88 */
 	if (attr == &sysfs_io_error_halflife)
@@ -731,6 +748,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
 
 	&sysfs_bset_tree_stats,
 	&sysfs_cache_read_races,
+	&sysfs_reclaim,
+	&sysfs_flush_write,
+	&sysfs_retry_flush_write,
 	&sysfs_writeback_keys_done,
 	&sysfs_writeback_keys_failed,
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 4df4c5c1cab2..a6763db7f061 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -112,6 +112,8 @@ do {	\
 
 #define heap_full(h)	((h)->used == (h)->size)
 
+#define heap_empty(h)	((h)->used == 0)
+
 #define DECLARE_FIFO(type, name)	\
 	struct {	\
 		size_t front, back, size, mask;	\
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 51306a19ab03..f1d2fc15abcc 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg)
 
 	while (!kthread_should_stop()) {
 		down_write(&dc->writeback_lock);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (!atomic_read(&dc->has_dirty) ||
 		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
 		     !dc->writeback_running)) {
 			up_write(&dc->writeback_lock);
-			set_current_state(TASK_INTERRUPTIBLE);
 
-			if (kthread_should_stop())
+			if (kthread_should_stop()) {
+				set_current_state(TASK_RUNNING);
 				return 0;
+			}
 
 			schedule();
 			continue;
 		}
+		set_current_state(TASK_RUNNING);
 
 		searched_full_index = refill_dirty(dc);
 
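The thread-loop hunk above is an instance of the standard kthread sleep idiom: set TASK_INTERRUPTIBLE before re-checking the sleep condition, so a wakeup that lands between the check and schedule() puts the task back to TASK_RUNNING and schedule() returns immediately instead of the wakeup being lost; the explicit TASK_RUNNING resets on the exit paths avoid leaving the task marked sleeping while it keeps executing. The sketch below is a generic illustration of the idiom, not bcache code; work_available() and do_work() are made-up placeholders.

/* Generic sketch of the "set state, then check, then sleep" kthread idiom. */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/types.h>

/* Hypothetical helpers standing in for the real work-detection logic. */
extern bool work_available(void *arg);
extern void do_work(void *arg);

static int example_thread(void *arg)
{
	while (!kthread_should_stop()) {
		/* 1: announce the intent to sleep before testing the condition */
		set_current_state(TASK_INTERRUPTIBLE);

		if (work_available(arg)) {
			/* 2: condition holds, so stay runnable and do the work */
			__set_current_state(TASK_RUNNING);
			do_work(arg);
			continue;
		}

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		/* 3: sleep; a wakeup issued after step 1 makes this return at once */
		schedule();
	}
	return 0;
}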
@@ -652,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate.rate = 1024;
 	dc->writeback_rate_minimum = 8;
 
-	dc->writeback_rate_update_seconds = 5;
+	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
 	dc->writeback_rate_p_term_inverse = 40;
 	dc->writeback_rate_i_term_inverse = 10000;
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 66f1c527fa24..587b25599856 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -8,6 +8,9 @@
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000	/* *512b */
 
+#define WRITEBACK_RATE_UPDATE_SECS_MAX		60
+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT	5
+
 /*
  * 14 (16384ths) is chosen here as something that each backing device
  * should be a reasonable fraction of the share, and not to blow up