author     Linus Torvalds <torvalds@linux-foundation.org>    2018-02-10 17:05:11 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2018-02-10 17:05:11 -0500
commit     9454473c9dccb7b9d25e5baf915a082bfd490b33 (patch)
tree       46f7f1a8886088e2f0184f1cf0e47c8ac12d4849
parent     cc5cb5af3a3363bc6f0530703895bf9c5fa2f159 (diff)
parent     8525e5ff456592effe83640ea1702525e35b0363 (diff)
Merge tag 'for-linus-20180210' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
 "A few fixes to round off the merge window on the block side:

  - a set of bcache fixes by way of Michael Lyle, from the usual bcache
    suspects.

  - add a simple-to-hook-into function for bpf EIO error injection.

  - fix blk-wbt that mischaracterized flushes as reads. Improve the
    logic so that flushes and writes are accounted as writes, and only
    reads as reads. From me.

  - fix requeue crash in BFQ, from Paolo"

* tag 'for-linus-20180210' of git://git.kernel.dk/linux-block:
  block, bfq: add requeue-request hook
  bcache: fix for data collapse after re-attaching an attached device
  bcache: return attach error when no cache set exist
  bcache: set writeback_rate_update_seconds in range [1, 60] seconds
  bcache: fix for allocator and register thread race
  bcache: set error_limit correctly
  bcache: properly set task state in bch_writeback_thread()
  bcache: fix high CPU occupancy during journal
  bcache: add journal statistic
  block: Add should_fail_bio() for bpf error injection
  blk-wbt: account flush requests correctly
-rw-r--r--  block/bfq-iosched.c            107
-rw-r--r--  block/blk-core.c                11
-rw-r--r--  block/blk-wbt.c                 10
-rw-r--r--  drivers/md/bcache/alloc.c        4
-rw-r--r--  drivers/md/bcache/bcache.h       9
-rw-r--r--  drivers/md/bcache/btree.c        9
-rw-r--r--  drivers/md/bcache/journal.c     52
-rw-r--r--  drivers/md/bcache/super.c       25
-rw-r--r--  drivers/md/bcache/sysfs.c       34
-rw-r--r--  drivers/md/bcache/util.h         2
-rw-r--r--  drivers/md/bcache/writeback.c    9
-rw-r--r--  drivers/md/bcache/writeback.h    3
12 files changed, 212 insertions(+), 63 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 47e6ec7427c4..aeca22d91101 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
                 }
 
                 /*
-                 * We exploit the bfq_finish_request hook to decrement
-                 * rq_in_driver, but bfq_finish_request will not be
-                 * invoked on this request. So, to avoid unbalance,
-                 * just start this request, without incrementing
-                 * rq_in_driver. As a negative consequence,
-                 * rq_in_driver is deceptively lower than it should be
-                 * while this request is in service. This may cause
-                 * bfq_schedule_dispatch to be invoked uselessly.
+                 * We exploit the bfq_finish_requeue_request hook to
+                 * decrement rq_in_driver, but
+                 * bfq_finish_requeue_request will not be invoked on
+                 * this request. So, to avoid unbalance, just start
+                 * this request, without incrementing rq_in_driver. As
+                 * a negative consequence, rq_in_driver is deceptively
+                 * lower than it should be while this request is in
+                 * service. This may cause bfq_schedule_dispatch to be
+                 * invoked uselessly.
                 *
                 * As for implementing an exact solution, the
-                 * bfq_finish_request hook, if defined, is probably
-                 * invoked also on this request. So, by exploiting
-                 * this hook, we could 1) increment rq_in_driver here,
-                 * and 2) decrement it in bfq_finish_request. Such a
-                 * solution would let the value of the counter be
-                 * always accurate, but it would entail using an extra
-                 * interface function. This cost seems higher than the
-                 * benefit, being the frequency of non-elevator-private
+                 * bfq_finish_requeue_request hook, if defined, is
+                 * probably invoked also on this request. So, by
+                 * exploiting this hook, we could 1) increment
+                 * rq_in_driver here, and 2) decrement it in
+                 * bfq_finish_requeue_request. Such a solution would
+                 * let the value of the counter be always accurate,
+                 * but it would entail using an extra interface
+                 * function. This cost seems higher than the benefit,
+                 * being the frequency of non-elevator-private
                 * requests very low.
                 */
                 goto start_rq;
@@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
                                    unsigned int cmd_flags) {}
 #endif
 
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                                bool at_head)
 {
@@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                 else
                         list_add_tail(&rq->queuelist, &bfqd->dispatch);
         } else {
+                if (WARN_ON_ONCE(!bfqq)) {
+                        /*
+                         * This should never happen. Most likely rq is
+                         * a requeued regular request, being
+                         * re-inserted without being first
+                         * re-prepared. Do a prepare, to avoid
+                         * failure.
+                         */
+                        bfq_prepare_request(rq, rq->bio);
+                        bfqq = RQ_BFQQ(rq);
+                }
+
                 idle_timer_disabled = __bfq_insert_request(bfqd, rq);
                 /*
                  * Update bfqq, because, if a queue merge has occurred
@@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
                 bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_finish_request_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
         bfqq->allocated--;
 
         bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-        struct bfq_queue *bfqq;
+        struct bfq_queue *bfqq = RQ_BFQQ(rq);
         struct bfq_data *bfqd;
 
-        if (!rq->elv.icq)
+        /*
+         * Requeue and finish hooks are invoked in blk-mq without
+         * checking whether the involved request is actually still
+         * referenced in the scheduler. To handle this fact, the
+         * following two checks make this function exit in case of
+         * spurious invocations, for which there is nothing to do.
+         *
+         * First, check whether rq has nothing to do with an elevator.
+         */
+        if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+                return;
+
+        /*
+         * rq either is not associated with any icq, or is an already
+         * requeued request that has not (yet) been re-inserted into
+         * a bfq_queue.
+         */
+        if (!rq->elv.icq || !bfqq)
                 return;
 
-        bfqq = RQ_BFQQ(rq);
         bfqd = bfqq->bfqd;
 
         if (rq->rq_flags & RQF_STARTED)
@@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
                 spin_lock_irqsave(&bfqd->lock, flags);
 
                 bfq_completed_request(bfqq, bfqd);
-                bfq_finish_request_body(bfqq);
+                bfq_finish_requeue_request_body(bfqq);
 
                 spin_unlock_irqrestore(&bfqd->lock, flags);
         } else {
                 /*
                  * Request rq may be still/already in the scheduler,
-                 * in which case we need to remove it. And we cannot
+                 * in which case we need to remove it (this should
+                 * never happen in case of requeue). And we cannot
                  * defer such a check and removal, to avoid
                  * inconsistencies in the time interval from the end
                  * of this function to the start of the deferred work.
@@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
                         bfqg_stats_update_io_remove(bfqq_group(bfqq),
                                                     rq->cmd_flags);
                 }
-                bfq_finish_request_body(bfqq);
+                bfq_finish_requeue_request_body(bfqq);
         }
 
+        /*
+         * Reset private fields. In case of a requeue, this allows
+         * this function to correctly do nothing if it is spuriously
+         * invoked again on this same request (see the check at the
+         * beginning of the function). Probably, a better general
+         * design would be to prevent blk-mq from invoking the requeue
+         * or finish hooks of an elevator, for a request that is not
+         * referred by that elevator.
+         *
+         * Resetting the following fields would break the
+         * request-insertion logic if rq is re-inserted into a bfq
+         * internal queue, without a re-preparation. Here we assume
+         * that re-insertions of requeued requests, without
+         * re-preparation, can happen only for pass_through or at_head
+         * requests (which are not re-inserted into bfq internal
+         * queues).
+         */
         rq->elv.priv[0] = NULL;
         rq->elv.priv[1] = NULL;
 }
@@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
         .ops.mq = {
                 .limit_depth            = bfq_limit_depth,
                 .prepare_request        = bfq_prepare_request,
-                .finish_request         = bfq_finish_request,
+                .requeue_request        = bfq_finish_requeue_request,
+                .finish_request         = bfq_finish_requeue_request,
                 .exit_icq               = bfq_exit_icq,
                 .insert_requests        = bfq_insert_requests,
                 .dispatch_request       = bfq_dispatch_request,
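
The double registration above is the crux of the fix: blk-mq may invoke the requeue and finish hooks spuriously, so the shared handler has to be idempotent. A minimal sketch of that pattern with stand-in types — sched_priv, fake_request and finish_or_requeue are illustrative, not kernel API:

#include <stddef.h>

struct sched_priv {
        int allocated;                  /* stand-in for bfqq->allocated */
};

struct fake_request {
        struct sched_priv *priv;        /* stand-in for rq->elv.priv[] */
};

static void finish_or_requeue(struct fake_request *rq)
{
        struct sched_priv *p = rq->priv;

        if (!p)                         /* spurious repeat: already torn down */
                return;

        p->allocated--;                 /* drop this request's reference */
        rq->priv = NULL;                /* make the next invocation a no-op */
}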
diff --git a/block/blk-core.c b/block/blk-core.c
index d0d104268f1a..2d1a7bbe0634 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
         return false;
 }
 
+static noinline int should_fail_bio(struct bio *bio)
+{
+        if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+                return -EIO;
+        return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
         if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
                 goto not_supported;
 
-        if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+        if (should_fail_bio(bio))
                 goto end_io;
 
         if (!bio->bi_partno) {
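
Because should_fail_bio() is tagged with ALLOW_ERROR_INJECTION(..., ERRNO), a BPF program attached to a kprobe on it may override its return value. A hedged sketch of such a program — assuming a libbpf-style loader, a kernel built with CONFIG_BPF_KPROBE_OVERRIDE=y, and the usual bpf_helpers.h conventions; the program and section names are illustrative:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

#define EIO 5                           /* mirrors the kernel's errno value */

SEC("kprobe/should_fail_bio")
int inject_eio(struct pt_regs *ctx)
{
        /* Forcing a -EIO return is only accepted by the verifier because
         * should_fail_bio() carries the ALLOW_ERROR_INJECTION marking. */
        bpf_override_return(ctx, -EIO);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";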
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ae8de9780085..f92fc84b5e2c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-        return rq_data_dir(rq);
+        const int op = req_op(rq);
+
+        if (op == REQ_OP_READ)
+                return READ;
+        else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+                return WRITE;
+
+        /* don't account */
+        return -1;
 }
 
 int wbt_init(struct request_queue *q)
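
The substance of this change: the old rq_data_dir()-based classification keyed off the low "write" bit of the operation, and REQ_OP_FLUSH (value 2 in the blk_types.h of this era) has that bit clear, so flushes were charged to the read bucket. A standalone before/after illustration — the op values are assumptions from that header, and all names are illustrative:

#include <stdio.h>

enum { OP_READ = 0, OP_WRITE = 1, OP_FLUSH = 2 };   /* assumed op encoding */

static int old_dir(int op)
{
        return op & 1;                  /* flush (2) -> 0, i.e. a "read" */
}

static int new_dir(int op)
{
        if (op == OP_READ)
                return 0;               /* READ */
        if (op == OP_WRITE || op == OP_FLUSH)
                return 1;               /* WRITE */
        return -1;                      /* don't account */
}

int main(void)
{
        /* prints "flush: old=0 new=1": reads no longer absorb flush latency */
        printf("flush: old=%d new=%d\n", old_dir(OP_FLUSH), new_dir(OP_FLUSH));
        return 0;
}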
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6cc6c0f9c3a9..458e1d38577d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -287,8 +287,10 @@ do { \
                         break;                                  \
                                                                 \
                 mutex_unlock(&(ca)->set->bucket_lock);          \
-                if (kthread_should_stop())                      \
+                if (kthread_should_stop()) {                    \
+                        set_current_state(TASK_RUNNING);        \
                         return 0;                               \
+                }                                               \
                                                                 \
                 schedule();                                     \
                 mutex_lock(&(ca)->set->bucket_lock);            \
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5e2d4e80198e..12e5197f186c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -658,10 +658,15 @@ struct cache_set {
         atomic_long_t           writeback_keys_done;
         atomic_long_t           writeback_keys_failed;
 
+        atomic_long_t           reclaim;
+        atomic_long_t           flush_write;
+        atomic_long_t           retry_flush_write;
+
         enum                    {
                 ON_ERROR_UNREGISTER,
                 ON_ERROR_PANIC,
         }                       on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
         unsigned                error_limit;
         unsigned                error_decay;
 
@@ -675,6 +680,8 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS        12
         struct hlist_head       bucket_hash[1 << BUCKET_HASH_BITS];
+
+        DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
@@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *);
 
 int bch_flash_dev_create(struct cache_set *c, uint64_t size);
 
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
 void bch_cached_dev_detach(struct cached_dev *);
 void bch_cached_dev_run(struct cached_dev *);
 void bcache_device_stop(struct bcache_device *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index bf3a48aa9a9a..fad9fe8817eb 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
          */
         for_each_cache(ca, c, i) {
                 for_each_bucket(b, ca) {
-                        if (fifo_full(&ca->free[RESERVE_PRIO]))
+                        if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+                            fifo_full(&ca->free[RESERVE_BTREE]))
                                 break;
 
                         if (bch_can_invalidate_bucket(ca, b) &&
                             !GC_MARK(b)) {
                                 __bch_invalidate_one_bucket(ca, b);
-                                fifo_push(&ca->free[RESERVE_PRIO],
-                                          b - ca->buckets);
+                                if (!fifo_push(&ca->free[RESERVE_PRIO],
+                                   b - ca->buckets))
+                                        fifo_push(&ca->free[RESERVE_BTREE],
+                                                  b - ca->buckets);
                         }
                 }
         }
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index a87165c1d8e5..1b736b860739 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -368,6 +368,12 @@ err:
 }
 
 /* Journalling */
+#define journal_max_cmp(l, r) \
+        (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+         fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+        (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+         fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
@@ -375,28 +381,41 @@ static void btree_flush_write(struct cache_set *c)
          * Try to find the btree node with that references the oldest journal
          * entry, best is our current candidate and is locked if non NULL:
          */
-        struct btree *b, *best;
-        unsigned i;
+        struct btree *b;
+        int i;
+
+        atomic_long_inc(&c->flush_write);
+
 retry:
-        best = NULL;
-
-        for_each_cached_btree(b, c, i)
-                if (btree_current_write(b)->journal) {
-                        if (!best)
-                                best = b;
-                        else if (journal_pin_cmp(c,
-                                        btree_current_write(best)->journal,
-                                        btree_current_write(b)->journal)) {
-                                best = b;
-                        }
-                }
-
-        b = best;
+        spin_lock(&c->journal.lock);
+        if (heap_empty(&c->flush_btree)) {
+                for_each_cached_btree(b, c, i)
+                        if (btree_current_write(b)->journal) {
+                                if (!heap_full(&c->flush_btree))
+                                        heap_add(&c->flush_btree, b,
+                                                 journal_max_cmp);
+                                else if (journal_max_cmp(b,
+                                         heap_peek(&c->flush_btree))) {
+                                        c->flush_btree.data[0] = b;
+                                        heap_sift(&c->flush_btree, 0,
+                                                  journal_max_cmp);
+                                }
+                        }
+
+                for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+                        heap_sift(&c->flush_btree, i, journal_min_cmp);
+        }
+
+        b = NULL;
+        heap_pop(&c->flush_btree, b, journal_min_cmp);
+        spin_unlock(&c->journal.lock);
+
         if (b) {
                 mutex_lock(&b->write_lock);
                 if (!btree_current_write(b)->journal) {
                         mutex_unlock(&b->write_lock);
                         /* We raced */
+                        atomic_long_inc(&c->retry_flush_write);
                         goto retry;
                 }
 
@@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
         unsigned iter, n = 0;
         atomic_t p;
 
+        atomic_long_inc(&c->reclaim);
+
         while (!atomic_read(&fifo_front(&c->journal.pin)))
                 fifo_pop(&c->journal.pin, p);
 
@@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
         j->w[0].c = c;
         j->w[1].c = c;
 
-        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+        if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+            !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
             !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
             !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
                 return -ENOMEM;
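
Taken together, the btree_flush_write() rework is a bounded top-K selection: a first pass keeps the K oldest candidates in a heap ordered by journal_max_cmp (the least-old kept node at the root, cheap to evict), then the same array is re-sifted with journal_min_cmp so heap_pop() yields nodes oldest-first. A self-contained sketch of the two-phase scheme, with plain ints standing in for journal fifo indexes and every name illustrative rather than bcache's:

#include <stddef.h>
#include <stdio.h>

#define K 4                             /* bounded candidate set, cf. init_heap(..., 128, ...) */

static struct { size_t used; int data[K]; } h;

static void sift(int *a, size_t i, size_t n, int (*cmp)(int, int))
{
        for (;;) {
                size_t l = 2 * i + 1, r = l + 1, m = i;

                if (l < n && cmp(a[l], a[m]))
                        m = l;
                if (r < n && cmp(a[r], a[m]))
                        m = r;
                if (m == i)
                        break;
                int t = a[i]; a[i] = a[m]; a[m] = t;
                i = m;
        }
}

static int max_cmp(int l, int r) { return l > r; }  /* newest on top: eviction candidate */
static int min_cmp(int l, int r) { return l < r; }  /* oldest on top: pop order */

static void consider(int idx)                       /* phase one: keep the K smallest */
{
        if (h.used < K) {
                h.data[h.used++] = idx;
                if (h.used == K)                    /* heapify once full */
                        for (size_t i = K / 2; i-- > 0; )
                                sift(h.data, i, K, max_cmp);
        } else if (idx < h.data[0]) {               /* older than the newest kept */
                h.data[0] = idx;
                sift(h.data, 0, K, max_cmp);
        }
}

static int pop_oldest(void)                         /* phase two: drain in age order */
{
        int top = h.data[0];

        h.data[0] = h.data[--h.used];
        sift(h.data, 0, h.used, min_cmp);
        return top;
}

int main(void)
{
        static const int stream[] = { 9, 3, 7, 1, 8, 2, 6 };

        for (size_t i = 0; i < sizeof(stream) / sizeof(stream[0]); i++)
                consider(stream[i]);

        /* mirror of the journal_min_cmp re-sift in btree_flush_write() */
        for (size_t i = h.used / 2; i-- > 0; )
                sift(h.data, i, h.used, min_cmp);

        while (h.used)
                printf("%d\n", pop_oldest());       /* prints 1, 2, 3, 6 */
        return 0;
}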
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 133b81225ea9..312895788036 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -957,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc)
         cached_dev_put(dc);
 }
 
-int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
+                          uint8_t *set_uuid)
 {
         uint32_t rtime = cpu_to_le32(get_seconds());
         struct uuid_entry *u;
@@ -965,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
         bdevname(dc->bdev, buf);
 
-        if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+        if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
+            (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
                 return -ENOENT;
 
         if (dc->disk.c) {
@@ -1194,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 
         list_add(&dc->list, &uncached_devices);
         list_for_each_entry(c, &bch_cache_sets, list)
-                bch_cached_dev_attach(dc, c);
+                bch_cached_dev_attach(dc, c, NULL);
 
         if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
             BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
@@ -1553,7 +1555,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
         c->congested_read_threshold_us  = 2000;
         c->congested_write_threshold_us = 20000;
-        c->error_limit  = 8 << IO_ERROR_SHIFT;
+        c->error_limit  = DEFAULT_IO_ERROR_LIMIT;
 
         return c;
 err:
@@ -1716,7 +1718,7 @@ static void run_cache_set(struct cache_set *c)
         bcache_write_super(c);
 
         list_for_each_entry_safe(dc, t, &uncached_devices, list)
-                bch_cached_dev_attach(dc, c);
+                bch_cached_dev_attach(dc, c, NULL);
 
         flash_devs_run(c);
 
@@ -1833,6 +1835,7 @@ void bch_cache_release(struct kobject *kobj)
 static int cache_alloc(struct cache *ca)
 {
         size_t free;
+        size_t btree_buckets;
         struct bucket *b;
 
         __module_get(THIS_MODULE);
@@ -1840,9 +1843,19 @@ static int cache_alloc(struct cache *ca)
 
         bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
 
+        /*
+         * when ca->sb.njournal_buckets is not zero, journal exists,
+         * and in bch_journal_replay(), tree node may split,
+         * so bucket of RESERVE_BTREE type is needed,
+         * the worst situation is all journal buckets are valid journal,
+         * and all the keys need to replay,
+         * so the number of RESERVE_BTREE type buckets should be as much
+         * as journal buckets
+         */
+        btree_buckets = ca->sb.njournal_buckets ?: 8;
         free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-        if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+        if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
             !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
             !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
             !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b4184092c727..78cd7bd50fdd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -65,6 +65,9 @@ read_attribute(bset_tree_stats);
 
 read_attribute(state);
 read_attribute(cache_read_races);
+read_attribute(reclaim);
+read_attribute(flush_write);
+read_attribute(retry_flush_write);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
@@ -195,7 +198,7 @@ STORE(__cached_dev)
 {
         struct cached_dev *dc = container_of(kobj, struct cached_dev,
                                              disk.kobj);
-        ssize_t v = size;
+        ssize_t v;
         struct cache_set *c;
         struct kobj_uevent_env *env;
 
@@ -215,7 +218,9 @@ STORE(__cached_dev)
         sysfs_strtoul_clamp(writeback_rate,
                             dc->writeback_rate.rate, 1, INT_MAX);
 
-        d_strtoul_nonzero(writeback_rate_update_seconds);
+        sysfs_strtoul_clamp(writeback_rate_update_seconds,
+                            dc->writeback_rate_update_seconds,
+                            1, WRITEBACK_RATE_UPDATE_SECS_MAX);
         d_strtoul(writeback_rate_i_term_inverse);
         d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
@@ -267,17 +272,20 @@ STORE(__cached_dev)
         }
 
         if (attr == &sysfs_attach) {
-                if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+                uint8_t set_uuid[16];
+
+                if (bch_parse_uuid(buf, set_uuid) < 16)
                         return -EINVAL;
 
+                v = -ENOENT;
                 list_for_each_entry(c, &bch_cache_sets, list) {
-                        v = bch_cached_dev_attach(dc, c);
+                        v = bch_cached_dev_attach(dc, c, set_uuid);
                         if (!v)
                                 return size;
                 }
 
                 pr_err("Can't attach %s: cache set not found", buf);
-                size = v;
+                return v;
         }
 
         if (attr == &sysfs_detach && dc->disk.c)
@@ -545,6 +553,15 @@ SHOW(__bch_cache_set)
         sysfs_print(cache_read_races,
                     atomic_long_read(&c->cache_read_races));
 
+        sysfs_print(reclaim,
+                    atomic_long_read(&c->reclaim));
+
+        sysfs_print(flush_write,
+                    atomic_long_read(&c->flush_write));
+
+        sysfs_print(retry_flush_write,
+                    atomic_long_read(&c->retry_flush_write));
+
         sysfs_print(writeback_keys_done,
                     atomic_long_read(&c->writeback_keys_done));
         sysfs_print(writeback_keys_failed,
@@ -556,7 +573,7 @@ SHOW(__bch_cache_set)
 
         /* See count_io_errors for why 88 */
         sysfs_print(io_error_halflife,  c->error_decay * 88);
-        sysfs_print(io_error_limit,     c->error_limit >> IO_ERROR_SHIFT);
+        sysfs_print(io_error_limit,     c->error_limit);
 
         sysfs_hprint(congested,
                      ((uint64_t) bch_get_congested(c)) << 9);
@@ -656,7 +673,7 @@ STORE(__bch_cache_set)
         }
 
         if (attr == &sysfs_io_error_limit)
-                c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+                c->error_limit = strtoul_or_return(buf);
 
         /* See count_io_errors() for why 88 */
         if (attr == &sysfs_io_error_halflife)
@@ -731,6 +748,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
 
         &sysfs_bset_tree_stats,
         &sysfs_cache_read_races,
+        &sysfs_reclaim,
+        &sysfs_flush_write,
+        &sysfs_retry_flush_write,
         &sysfs_writeback_keys_done,
         &sysfs_writeback_keys_failed,
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 4df4c5c1cab2..a6763db7f061 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -112,6 +112,8 @@ do { \
 
 #define heap_full(h)    ((h)->used == (h)->size)
 
+#define heap_empty(h)   ((h)->used == 0)
+
 #define DECLARE_FIFO(type, name)                                        \
         struct {                                                        \
                 size_t front, back, size, mask;                         \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 51306a19ab03..f1d2fc15abcc 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg)
 
         while (!kthread_should_stop()) {
                 down_write(&dc->writeback_lock);
+                set_current_state(TASK_INTERRUPTIBLE);
                 if (!atomic_read(&dc->has_dirty) ||
                     (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
                      !dc->writeback_running)) {
                         up_write(&dc->writeback_lock);
-                        set_current_state(TASK_INTERRUPTIBLE);
 
-                        if (kthread_should_stop())
+                        if (kthread_should_stop()) {
+                                set_current_state(TASK_RUNNING);
                                 return 0;
+                        }
 
                         schedule();
                         continue;
                 }
+                set_current_state(TASK_RUNNING);
 
                 searched_full_index = refill_dirty(dc);
 
@@ -652,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
         dc->writeback_rate.rate         = 1024;
         dc->writeback_rate_minimum      = 8;
 
-        dc->writeback_rate_update_seconds = 5;
+        dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
         dc->writeback_rate_p_term_inverse = 40;
         dc->writeback_rate_i_term_inverse = 10000;
 
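
This fix and the alloc.c hunk earlier restore the canonical kthread sleep pattern: set TASK_INTERRUPTIBLE before testing the sleep condition, so a wakeup racing with the test cannot be lost, and always return to TASK_RUNNING before doing work or returning. A generic sketch in kernel style, where work_available() and do_work() are hypothetical placeholders:

#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread(void *arg)
{
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);

                if (!work_available(arg)) {         /* hypothetical predicate */
                        if (kthread_should_stop()) {
                                set_current_state(TASK_RUNNING);
                                break;
                        }
                        schedule();                 /* actually go to sleep */
                        continue;
                }

                set_current_state(TASK_RUNNING);    /* running again before working */
                do_work(arg);                       /* hypothetical */
        }
        return 0;
}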
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 66f1c527fa24..587b25599856 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -8,6 +8,9 @@
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000    /* *512b */
 
+#define WRITEBACK_RATE_UPDATE_SECS_MAX          60
+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT      5
+
 /*
  * 14 (16384ths) is chosen here as something that each backing device
  * should be a reasonable fraction of the share, and not to blow up