From 718822c1c112dc99e0c72c8968ee1db9d9d910f0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Nov 2013 16:12:20 -0500 Subject: dm delay: fix a possible deadlock due to shared workqueue The dm-delay target uses a shared workqueue for multiple instances. This can cause deadlock if two or more dm-delay targets are stacked on the top of each other. This patch changes dm-delay to use a per-instance workqueue. Cc: stable@vger.kernel.org # 2.6.22+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 496d5f3646a5..2f91d6d4a2cc 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -20,6 +20,7 @@ struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; + struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; @@ -45,14 +46,13 @@ struct dm_delay_info { static DEFINE_MUTEX(delayed_bios_lock); -static struct workqueue_struct *kdelayd_wq; static struct kmem_cache *delayed_cache; static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; - queue_work(kdelayd_wq, &dc->flush_expired_bios); + queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } static void queue_timeout(struct delay_c *dc, unsigned long expires) @@ -191,6 +191,12 @@ out: goto bad_dev_write; } + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; + } + setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); @@ -203,6 +209,8 @@ out: ti->private = dc; return 0; +bad_queue: + mempool_destroy(dc->delayed_pool); bad_dev_write: if (dc->dev_write) dm_put_device(ti, dc->dev_write); @@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - flush_workqueue(kdelayd_wq); + destroy_workqueue(dc->kdelayd_wq); dm_put_device(ti, dc->dev_read); @@ -350,12 +358,6 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - delayed_cache = KMEM_CACHE(dm_delay_info, 0); if (!delayed_cache) { DMERR("Couldn't create delayed bio cache."); @@ -373,8 +375,6 @@ static int __init dm_delay_init(void) bad_register: kmem_cache_destroy(delayed_cache); bad_memcache: - destroy_workqueue(kdelayd_wq); -bad_queue: return r; } @@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void) { dm_unregister_target(&delay_target); kmem_cache_destroy(delayed_cache); - destroy_workqueue(kdelayd_wq); } /* Module hooks */ -- cgit v1.2.2 From c170bbb45febc03ac4d34ba2b8bb55e06104b7e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2013 16:32:22 -0700 Subject: block: submit_bio_wait() conversions It was being open coded in a few places. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Joern Engel Cc: Prasad Joshi Cc: Neil Brown Cc: Chris Mason Acked-by: NeilBrown Signed-off-by: Jens Axboe --- drivers/md/md.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b6b7a2866c9e..8700de376876 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -776,16 +776,10 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; int ret; rw |= REQ_SYNC; @@ -801,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); -- cgit v1.2.2 From 0c775d5208284700de423e6746259da54a42e1f5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 25 Nov 2013 11:12:43 +1100 Subject: md/raid5: fix new memory-reference bug in alloc_thread_groups. In alloc_thread_groups, worker_groups is a pointer to an array, not an array of pointers. So worker_groups[i] is wrong. It should be &(*worker_groups)[i] Found-by: coverity Fixes: 60aaf9338545 Reported-by: Ben Hutchings Cc: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 47da0af6322b..676d8b7c5117 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5471,7 +5471,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, for (i = 0; i < *group_cnt; i++) { struct r5worker_group *group; - group = worker_groups[i]; + group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); group->conf = conf; group->workers = workers + i * cnt; -- cgit v1.2.2 From 142d44c310819e1965ca70b4d55d7679f5797e25 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Nov 2013 10:34:18 +1100 Subject: md: test mddev->flags more safely in md_check_recovery. commit 7a0a5355cbc71efa md: Don't test all of mddev->flags at once. made most tests on mddev->flags safer, but missed one. When commit 260fa034ef7a4ff8b7306 md: avoid deadlock when dirty buffers during md_stop. added MD_STILL_CLOSED, this caused md_check_recovery to misbehave. It can think there is something to do but find nothing. This can lead to the md thread spinning during array shutdown. https://bugzilla.kernel.org/show_bug.cgi?id=65721 Reported-and-tested-by: Richard W.M. Jones Fixes: 260fa034ef7a4ff8b7306 Cc: stable@vger.kernel.org (3.12) Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b6b7a2866c9e..e60cebf3f519 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7777,7 +7777,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags & ~ (1<flags & MD_UPDATE_SB_FLAGS & ~ (1<recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || -- cgit v1.2.2 From 6d183de4077191d1201283a9035ce57a9b05254d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 28 Nov 2013 10:55:27 +1100 Subject: md/raid5: fix newly-broken locking in get_active_stripe. commit 566c09c53455d7c4f1 raid5: relieve lock contention in get_active_stripe() modified the locking in get_active_stripe() reducing the range protected by the (highly contended) device_lock. Unfortunately it reduced the range too much opening up some races. One race can occur if get_priority_stripe runs between the test on sh->count and device_lock being taken. This will mean that sh->lru is not empty while get_active_stripe thinks ->count is zero resulting in a 'BUG' firing. Another race happens if __release_stripe is called immediately after sh->count is tested and found to be non-zero. If STRIPE_HANDLE is not set, get_active_stripe should increment ->active_stripes when it increments ->count from 0, but as it didn't think it was 0, it doesn't. Extending device_lock to cover the test on sh->count close these races. While we are here, fix the two BUG tests: -If count is zero, then lru really must not be empty, or we've lock the stripe_head somehow - no other tests are relevant. -STRIPE_ON_RELEASE_LIST is completely independent of ->lru so testing it is pointless. Reported-and-tested-by: Brassow Jonathan Reviewed-by: Shaohua Li Fixes: 566c09c53455d7c4f1 Signed-off-by: NeilBrown --- drivers/md/raid5.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 676d8b7c5117..cc055da02e2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -678,26 +678,23 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } else init_stripe(sh, sector, previous); } else { + spin_lock(&conf->device_lock); if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state) && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) - && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); + ); } else { - spin_lock(&conf->device_lock); if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru) && - !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) && - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); + BUG_ON(list_empty(&sh->lru)); list_del_init(&sh->lru); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; } - spin_unlock(&conf->device_lock); } + spin_unlock(&conf->device_lock); } } while (sh == NULL); -- cgit v1.2.2 From 08239ca2a053dbc3b082916bdfbd88e5a9ad9267 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 28 Nov 2013 10:31:35 +0800 Subject: bcache: fix sparse non static symbol warning Fixes the following sparse warning: drivers/md/bcache/btree.c:2220:5: warning: symbol 'btree_insert_fn' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5e2765aadce1..dc6e2be265b7 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2217,7 +2217,7 @@ struct btree_insert_op { struct bkey *replace_key; }; -int btree_insert_fn(struct btree_op *b_op, struct btree *b) +static int btree_insert_fn(struct btree_op *b_op, struct btree *b) { struct btree_insert_op *op = container_of(b_op, struct btree_insert_op, op); -- cgit v1.2.2 From 230c83afdd9cd384348475bea1e14b80b3b6b1b8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 29 Nov 2013 18:13:37 -0500 Subject: dm snapshot: avoid snapshot space leak on crash There is a possible leak of snapshot space in case of crash. The reason for space leaking is that chunks in the snapshot device are allocated sequentially, but they are finished (and stored in the metadata) out of order, depending on the order in which copying finished. For example, supposed that the metadata contains the following records SUPERBLOCK METADATA (blocks 0 ... 250) DATA 0 DATA 1 DATA 2 ... DATA 250 Now suppose that you allocate 10 new data blocks 251-260. Suppose that copying of these blocks finish out of order (block 260 finished first and the block 251 finished last). Now, the snapshot device looks like this: SUPERBLOCK METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256) DATA 0 DATA 1 DATA 2 ... DATA 250 DATA 251 DATA 252 DATA 253 DATA 254 DATA 255 METADATA (blocks 255, 254, 253, 252, 251) DATA 256 DATA 257 DATA 258 DATA 259 DATA 260 Now, if the machine crashes after writing the first metadata block but before writing the second metadata block, the space for areas DATA 250-255 is leaked, it contains no valid data and it will never be used in the future. This patch makes dm-snapshot complete exceptions in the same order they were allocated, thus fixing this bug. Note: when backporting this patch to the stable kernel, change the version field in the following way: * if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0} * if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it to {1, 10, 2} Userspace reads the version to determine if the bug was fixed, so the version change is needed. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-snap.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index aec57d76db5d..944690bafd93 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -66,6 +66,18 @@ struct dm_snapshot { atomic_t pending_exceptions_count; + /* Protected by "lock" */ + sector_t exception_start_sequence; + + /* Protected by kcopyd single-threaded callback */ + sector_t exception_complete_sequence; + + /* + * A list of pending exceptions that completed out of order. + * Protected by kcopyd single-threaded callback. + */ + struct list_head out_of_order_list; + mempool_t *pending_pool; struct dm_exception_table pending; @@ -173,6 +185,14 @@ struct dm_snap_pending_exception { */ int started; + /* There was copying error. */ + int copy_error; + + /* A sequence number, it is used for in-order completion. */ + sector_t exception_sequence; + + struct list_head out_of_order_entry; + /* * For writing a complete chunk, bypassing the copy. */ @@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + s->exception_start_sequence = 0; + s->exception_complete_sequence = 0; + INIT_LIST_HEAD(&s->out_of_order_list); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); @@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success) pending_complete(pe, success); } +static void complete_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + if (unlikely(pe->copy_error)) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); +} + /* * Called when the copy I/O has finished. kcopyd actually runs * this code so don't block. @@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) struct dm_snap_pending_exception *pe = context; struct dm_snapshot *s = pe->snap; - if (read_err || write_err) - pending_complete(pe, 0); + pe->copy_error = read_err || write_err; - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); + if (pe->exception_sequence == s->exception_complete_sequence) { + s->exception_complete_sequence++; + complete_exception(pe); + + while (!list_empty(&s->out_of_order_list)) { + pe = list_entry(s->out_of_order_list.next, + struct dm_snap_pending_exception, out_of_order_entry); + if (pe->exception_sequence != s->exception_complete_sequence) + break; + s->exception_complete_sequence++; + list_del(&pe->out_of_order_entry); + complete_exception(pe); + } + } else { + struct list_head *lh; + struct dm_snap_pending_exception *pe2; + + list_for_each_prev(lh, &s->out_of_order_list) { + pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); + if (pe2->exception_sequence < pe->exception_sequence) + break; + } + list_add(&pe->out_of_order_entry, lh); + } } /* @@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s, return NULL; } + pe->exception_sequence = s->exception_start_sequence++; + dm_insert_exception(&s->pending, &pe->e); return pe; @@ -2192,7 +2249,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 11, 1}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, -- cgit v1.2.2 From 5b2d06576c5410c10d95adfd5c4d8b24de861d87 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 22 Nov 2013 19:52:06 -0500 Subject: dm table: fail dm_table_create on dm_round_up overflow The dm_round_up function may overflow to zero. In this case, dm_table_create() must fail rather than go on to allocate an empty array with alloc_targets(). This fixes a possible memory corruption that could be caused by passing too large a number in "param->target_count". Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-table.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 465f08ca62b1..3ba6a3859ce3 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + if (!num_targets) { + kfree(t); + return -ENOMEM; + } + if (alloc_targets(t, num_targets)) { kfree(t); return -ENOMEM; -- cgit v1.2.2 From f62b6b8f498658a9d537c7d380e9966f15e1b2a1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 2 Dec 2013 16:47:01 -0500 Subject: dm space map metadata: return on failure in sm_metadata_new_block Commit 2fc48021f4afdd109b9e52b6eef5db89ca80bac7 ("dm persistent metadata: add space map threshold callback") introduced a regression to the metadata block allocation path that resulted in errors being ignored. This regression was uncovered by running the following device-mapper-test-suite test: dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/ The ignored error codes in sm_metadata_new_block() could crash the kernel through use of either the dm-thin or dm-cache targets, e.g.: device-mapper: thin: 253:4: reached low water mark for metadata device: sending event. device-mapper: space map metadata: unable to allocate new metadata block general protection fault: 0000 [#1] SMP ... Workqueue: dm-thin do_worker [dm_thin_pool] task: ffff880035ce2ab0 ti: ffff88021a054000 task.ti: ffff88021a054000 RIP: 0010:[] [] metadata_ll_load_ie+0x15/0x30 [dm_persistent_data] RSP: 0018:ffff88021a055a68 EFLAGS: 00010202 RAX: 003fc8243d212ba0 RBX: ffff88021a780070 RCX: ffff88021a055a78 RDX: ffff88021a055a78 RSI: 0040402222a92a80 RDI: ffff88021a780070 RBP: ffff88021a055a68 R08: ffff88021a055ba4 R09: 0000000000000010 R10: 0000000000000000 R11: 00000002a02e1000 R12: ffff88021a055ad4 R13: 0000000000000598 R14: ffffffffa0338470 R15: ffff88021a055ba4 FS: 0000000000000000(0000) GS:ffff88033fca0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f467c0291b8 CR3: 0000000001a0b000 CR4: 00000000000007e0 Stack: ffff88021a055ab8 ffffffffa0332020 ffff88021a055b30 0000000000000001 ffff88021a055b30 0000000000000000 ffff88021a055b18 0000000000000000 ffff88021a055ba4 ffff88021a055b98 ffff88021a055ae8 ffffffffa033304c Call Trace: [] sm_ll_lookup_bitmap+0x40/0xa0 [dm_persistent_data] [] sm_metadata_count_is_more_than_one+0x8c/0xc0 [dm_persistent_data] [] dm_tm_shadow_block+0x65/0x110 [dm_persistent_data] [] sm_ll_mutate+0x80/0x300 [dm_persistent_data] [] ? set_ref_count+0x10/0x10 [dm_persistent_data] [] sm_ll_inc+0x1a/0x20 [dm_persistent_data] [] sm_disk_new_block+0x60/0x80 [dm_persistent_data] [] ? down_write+0x16/0x40 [] dm_pool_alloc_data_block+0x54/0x80 [dm_thin_pool] [] alloc_data_block+0x9c/0x130 [dm_thin_pool] [] provision_block+0x4e/0x180 [dm_thin_pool] [] ? dm_thin_find_block+0x6a/0x110 [dm_thin_pool] [] process_bio+0x1ca/0x1f0 [dm_thin_pool] [] ? mempool_free+0x8d/0xa0 [] process_deferred_bios+0xc5/0x230 [dm_thin_pool] [] do_worker+0x51/0x60 [dm_thin_pool] [] process_one_work+0x182/0x3b0 [] worker_thread+0x120/0x3a0 [] ? manage_workers+0x160/0x160 [] kthread+0xce/0xe0 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? kthread_freezable_should_stop+0x70/0x70 Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org # v3.10+ --- drivers/md/persistent-data/dm-space-map-metadata.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 1c959684caef..58fc1eef7499 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); int r = sm_metadata_new_block_(sm, b); - if (r) + if (r) { DMERR("unable to allocate new metadata block"); + return r; + } r = sm_metadata_get_nr_free(sm, &count); - if (r) + if (r) { DMERR("couldn't get free block count"); + return r; + } check_threshold(&smm->threshold, count); -- cgit v1.2.2 From fafc7a815e40255d24e80a1cb7365892362fa398 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 2 Dec 2013 17:57:42 -0500 Subject: dm thin: switch to read only mode if a mapping insert fails Switch the thin pool to read-only mode when dm_thin_insert_block() fails since there is little reason to expect the cause of the failure to be resolved without further action by user space. This issue was noticed with the device-mapper-test-suite using: dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/ The quantity of errors logged in this case must be reduced. before patch: device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: dm_thin_insert_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: thin: 253:4: no free metadata space available. device-mapper: thin: 253:4: switching pool to read-only mode after patch: device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: 253:4: dm_thin_insert_block() failed: error = -28 device-mapper: thin: 253:4: switching pool to read-only mode Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2c0cf511ec23..24327adc925c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { - DMERR_LIMIT("dm_thin_insert_block() failed"); + DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", + dm_device_name(pool->pool_md), r); + set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, m->cell); goto out; } -- cgit v1.2.2 From 4a02b34e0cf1d0d0dd3737702841da4bf615a50a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Dec 2013 12:20:57 -0500 Subject: dm thin: switch to read-only mode if metadata space is exhausted Switch the thin pool to read-only mode in alloc_data_block() if dm_pool_alloc_data_block() fails because the pool's metadata space is exhausted. Differentiate between data and metadata space in messages about no free space available. This issue was noticed with the device-mapper-test-suite using: dmtest run --suite thin-provisioning -n /exhausting_metadata_space_causes_fail_mode/ The quantity of errors logged in this case must be reduced. before patch: device-mapper: thin: 253:4: reached low water mark for metadata device: sending event. device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: space map common: dm_tm_shadow_block() failed device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: 253:4: commit failed: error = -28 device-mapper: thin: 253:4: switching pool to read-only mode after patch: device-mapper: thin: 253:4: reached low water mark for metadata device: sending event. device-mapper: space map metadata: unable to allocate new metadata block device-mapper: thin: 253:4: no free metadata space available. device-mapper: thin: 253:4: switching pool to read-only mode Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 24327adc925c..44ee489f4a6e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -959,7 +959,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) * table reload). */ if (!free_blocks) { - DMWARN("%s: no free space available.", + DMWARN("%s: no free data space available.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); pool->no_free_space = 1; @@ -969,8 +969,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) } r = dm_pool_alloc_data_block(pool->pmd, result); - if (r) + if (r) { + if (r == -ENOSPC && + !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && + !free_blocks) { + DMWARN("%s: no free metadata space available.", + dm_device_name(pool->pool_md)); + set_pool_mode(pool, PM_READ_ONLY); + } return r; + } return 0; } -- cgit v1.2.2 From 020cc3b5e28c2e24f59f53a9154faf08564f308e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 15:05:36 -0500 Subject: dm thin: always fallback the pool mode if commit fails Rename commit_or_fallback() to commit(). Now all previous calls to commit() will trigger the pool mode to fallback if the commit fails. Also, check the error returned from commit() in alloc_data_block(). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 44ee489f4a6e..af79bae5ab74 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -883,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, } } -static int commit(struct pool *pool) -{ - int r; - - r = dm_pool_commit_metadata(pool->pmd); - if (r) - DMERR_LIMIT("%s: commit failed: error = %d", - dm_device_name(pool->pool_md), r); - - return r; -} - /* * A non-zero return indicates read_only or fail_io mode. * Many callers don't care about the return value. */ -static int commit_or_fallback(struct pool *pool) +static int commit(struct pool *pool) { int r; if (get_pool_mode(pool) != PM_WRITE) return -EINVAL; - r = commit(pool); - if (r) + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", + dm_device_name(pool->pool_md), r); set_pool_mode(pool, PM_READ_ONLY); + } return r; } @@ -945,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) * Try to commit to see if that will free up some * more space. */ - (void) commit_or_fallback(pool); + r = commit(pool); + if (r) + return r; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); if (r) @@ -1359,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool) if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) return; - if (commit_or_fallback(pool)) { + if (commit(pool)) { while ((bio = bio_list_pop(&bios))) bio_io_error(bio); return; @@ -2276,7 +2269,7 @@ static int pool_preresume(struct dm_target *ti) return r; if (need_commit1 || need_commit2) - (void) commit_or_fallback(pool); + (void) commit(pool); return 0; } @@ -2303,7 +2296,7 @@ static void pool_postsuspend(struct dm_target *ti) cancel_delayed_work(&pool->waker); flush_workqueue(pool->wq); - (void) commit_or_fallback(pool); + (void) commit(pool); } static int check_arg_count(unsigned argc, unsigned args_required) @@ -2437,7 +2430,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct if (r) return r; - (void) commit_or_fallback(pool); + (void) commit(pool); r = dm_pool_reserve_metadata_snap(pool->pmd); if (r) @@ -2499,7 +2492,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) DMWARN("Unrecognised thin pool target message received: %s", argv[0]); if (!r) - (void) commit_or_fallback(pool); + (void) commit(pool); return r; } @@ -2554,7 +2547,7 @@ static void pool_status(struct dm_target *ti, status_type_t type, /* Commit to ensure statistics aren't out-of-date */ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) - (void) commit_or_fallback(pool); + (void) commit(pool); r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); if (r) { -- cgit v1.2.2 From 5383ef3a929a1366e2ced45cd6d74be7aa2a2281 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 16:30:01 -0500 Subject: dm thin: re-establish read-only state when switching to fail mode If the thin-pool transitioned to fail mode and the thin-pool's table were reloaded for some reason: the new table's default pool mode would be read-write, though it will transition to fail mode during resume. When the pool mode transitions directly from PM_WRITE to PM_FAIL we need to re-establish the intermediate read-only state in both the metadata and persistent-data block manager (as is usually done with the normal pool mode transition sequence: PM_WRITE -> PM_READ_ONLY -> PM_FAIL). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index af79bae5ab74..0d7852e2b275 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1400,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) case PM_FAIL: DMERR("%s: switching pool to failure mode", dm_device_name(pool->pool_md)); + dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; -- cgit v1.2.2 From 9b7aaa64f96f7ca280d75326fca42f42017b89ef Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 16:58:19 -0500 Subject: dm thin: allow pool in read-only mode to transition to read-write mode A thin-pool may be in read-only mode because the pool's data or metadata space was exhausted. To allow for recovery, by adding more space to the pool, we must allow a pool to transition from PM_READ_ONLY to PM_WRITE mode. Otherwise, running out of space will render the pool permanently read-only. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin-metadata.c | 8 ++++++++ drivers/md/dm-thin-metadata.h | 1 + drivers/md/dm-thin.c | 12 ++++++++++-- drivers/md/persistent-data/dm-block-manager.c | 6 ++++++ drivers/md/persistent-data/dm-block-manager.h | 7 ++++--- 5 files changed, 29 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 60bce435f4fa..8a30ad54bd46 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) up_write(&pmd->root_lock); } +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) +{ + down_write(&pmd->root_lock); + pmd->read_only = false; + dm_bm_set_read_write(pmd->bm); + up_write(&pmd->root_lock); +} + int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_block_t threshold, dm_sm_threshold_fn fn, diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 845ebbe589a9..7bcc0e1d6238 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz * that nothing is changing. */ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd); int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_block_t threshold, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 0d7852e2b275..ee29037ffc2e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1425,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) break; case PM_WRITE: + dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard; pool->process_prepared_mapping = process_prepared_mapping; @@ -1641,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) struct pool_c *pt = ti->private; /* - * We want to make sure that degraded pools are never upgraded. + * We want to make sure that a pool in PM_FAIL mode is never upgraded. */ enum pool_mode old_mode = pool->pf.mode; enum pool_mode new_mode = pt->adjusted_pf.mode; - if (old_mode > new_mode) + /* + * If we were in PM_FAIL mode, rollback of metadata failed. We're + * not going to recover without a thin_repair. So we never let the + * pool move out of the old mode. On the other hand a PM_READ_ONLY + * may have been due to a lack of metadata or data space, and may + * now work (ie. if the underlying devices have been resized). + */ + if (old_mode == PM_FAIL) new_mode = old_mode; pool->ti = ti; diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a7e8bf296388..064a3c271baa 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm) } EXPORT_SYMBOL_GPL(dm_bm_set_read_only); +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + bm->read_only = false; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); + u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) { return crc32c(~(u32) 0, data, len) ^ init_xor; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 9a82083a66b6..13cd58e1fe69 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b); int dm_bm_flush_and_unlock(struct dm_block_manager *bm, struct dm_block *superblock); - /* - * Request data be prefetched into the cache. - */ +/* + * Request data is prefetched into the cache. + */ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); /* @@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); * be returned if you do. */ void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); -- cgit v1.2.2 From af95e7a69b54bca48092e3013a92cfa3043c9c73 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 15 Nov 2013 10:51:20 +0000 Subject: dm cache policy mq: fix promotions to occur as expected Micro benchmarks that repeatedly issued IO to a single block were failing to cause a promotion from the origin device to the cache. Fix this by not updating the stats during map() if -EWOULDBLOCK will be returned. The mq policy will only update stats, consider migration, etc, once per tick period (a unit of time established between dm-cache core and the policies). When the IO thread calls the policy's map method, if it would like to migrate the associated block it returns -EWOULDBLOCK, the IO then gets handed over to a worker thread which handles the migration. The worker thread calls map again, to check the migration is still needed (avoids a race among other things). *BUT*, before this fix, if we were still in the same tick period the stats were already updated by the previous map call -- so the migration would no longer be requested. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 416b7b752a6e..64780ad73bb0 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, int r = 0; bool updated = updated_this_tick(mq, e); - requeue_and_update_tick(mq, e); - if ((!discarded_oblock && updated) || - !should_promote(mq, e, discarded_oblock, data_dir)) + !should_promote(mq, e, discarded_oblock, data_dir)) { + requeue_and_update_tick(mq, e); result->op = POLICY_MISS; - else if (!can_migrate) + + } else if (!can_migrate) r = -EWOULDBLOCK; - else + + else { + requeue_and_update_tick(mq, e); r = pre_cache_to_cache(mq, e, result); + } return r; } -- cgit v1.2.2 From 088448007bb97af47ec3f05fc3e9517ffb5e9fba Mon Sep 17 00:00:00 2001 From: Vincent Pelletier Date: Sat, 30 Nov 2013 12:58:42 +0100 Subject: dm cache: actually resize cache Commit f494a9c6b1b6dd9a9f21bbb75d9210d478eeb498 ("dm cache: cache shrinking support") broke cache resizing support. dm_cache_resize() is called with cache->cache_size before it gets updated to new_size, so it is a no-op. But the dm-cache superblock is updated with the new_size even though the backing dm-array is not resized. Fix this by passing the new_size to dm_cache_resize(). Signed-off-by: Vincent Pelletier Acked-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9efcf1059b99..1b1469ebe5cb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) { int r; - r = dm_cache_resize(cache->cmd, cache->cache_size); + r = dm_cache_resize(cache->cmd, new_size); if (r) { DMERR("could not resize cache metadata"); return r; -- cgit v1.2.2 From 4cb57ab4a2e61978f3a9b7d4f53988f30d61c27f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 5 Dec 2013 17:33:29 -0500 Subject: dm bufio: initialize read-only module parameters Some module parameters in dm-bufio are read-only. These parameters inform the user about memory consumption. They are not supposed to be changed by the user. However, despite being read-only, these parameters can be set on modprobe or insmod command line, for example: modprobe dm-bufio current_allocated_bytes=12345 The kernel doesn't expect that these variables can be non-zero at module initialization and if the user sets them, it results in BUG. This patch initializes the variables in the module init routine, so that user-supplied values are ignored. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.2+ --- drivers/md/dm-bufio.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 173cbb20d104..54bdd923316f 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void) { __u64 mem; + dm_bufio_allocated_kmem_cache = 0; + dm_bufio_allocated_get_free_pages = 0; + dm_bufio_allocated_vmalloc = 0; + dm_bufio_current_allocated = 0; + memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); -- cgit v1.2.2 From 76f5bee5c3b45c617f91243e85547fc8f67bc678 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 5 Dec 2013 17:34:19 -0500 Subject: dm stats: initialize read-only module parameter The module parameter stats_current_allocated_bytes in dm-mod is read-only. This parameter informs the user about memory consumption. It is not supposed to be changed by the user. However, despite being read-only, this parameter can be set on modprobe or insmod command line: modprobe dm-mod stats_current_allocated_bytes=12345 The kernel doesn't expect that this variable can be non-zero at module initialization and if the user sets it, it results in warning. This patch initializes the variable in the module init routine, so that user-supplied value is ignored. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.12+ --- drivers/md/dm-stats.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 3d404c1371ed..28a90122a5a8 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, int __init dm_statistics_init(void) { + shared_memory_amount = 0; dm_stat_need_rcu_barrier = 0; return 0; } -- cgit v1.2.2 From 5b564d80f8bc21094c0cd2b19b679d983aabcc29 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 13 Dec 2013 12:31:08 +0000 Subject: dm space map: disallow decrementing a reference count below zero The old behaviour, returning -EINVAL if a ref_count of 0 would be decremented, was removed in commit f722063 ("dm space map: optimise sm_ll_dec and sm_ll_inc"). To fix this regression we return an error code from the mutator function pointer passed to sm_ll_mutate() and have dec_ref_count() return -EINVAL if the old ref_count is 0. Add a DMERR to reflect the potential seriousness of this error. Also, add missing dm_tm_unlock() to sm_ll_mutate()'s error path. With this fix the following dmts regression test now passes: dmtest run --suite cache -n /metadata_use_kernel/ The next patch fixes the higher-level dm-array code that exposed this regression. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.12+ --- drivers/md/persistent-data/dm-space-map-common.c | 32 +++++++++++++++++------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 6058569fe86c..466a60bbd716 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, } static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, - uint32_t (*mutator)(void *context, uint32_t old), + int (*mutator)(void *context, uint32_t old, uint32_t *new), void *context, enum allocation_event *ev) { int r; @@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (old > 2) { r = sm_ll_lookup_big_ref_count(ll, b, &old); - if (r < 0) + if (r < 0) { + dm_tm_unlock(ll->tm, nb); return r; + } } - ref_count = mutator(context, old); + r = mutator(context, old, &ref_count); + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); @@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, return ll->save_ie(ll, index, &ie_disk); } -static uint32_t set_ref_count(void *context, uint32_t old) +static int set_ref_count(void *context, uint32_t old, uint32_t *new) { - return *((uint32_t *) context); + *new = *((uint32_t *) context); + return 0; } int sm_ll_insert(struct ll_disk *ll, dm_block_t b, @@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); } -static uint32_t inc_ref_count(void *context, uint32_t old) +static int inc_ref_count(void *context, uint32_t old, uint32_t *new) { - return old + 1; + *new = old + 1; + return 0; } int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) @@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); } -static uint32_t dec_ref_count(void *context, uint32_t old) +static int dec_ref_count(void *context, uint32_t old, uint32_t *new) { - return old - 1; + if (!old) { + DMERR_LIMIT("unable to decrement a reference count below 0"); + return -EINVAL; + } + + *new = old - 1; + return 0; } int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) -- cgit v1.2.2 From ed9571f0cf1fe09d3506302610f3ccdfa1d22c4a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 13 Dec 2013 14:55:55 +0000 Subject: dm array: fix a reference counting bug in shadow_ablock An old array block could have its reference count decremented below zero when it is being replaced in the btree by a new array block. The fix is to increment the old ablock's reference count just before inserting a new ablock into the btree. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.9+ --- drivers/md/persistent-data/dm-array.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index af96e24ec328..1d75b1dc1e2e 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, * The shadow op will often be a noop. Only insert if it really * copied data. */ - if (dm_block_location(*block) != b) + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); r = insert_ablock(info, index, *block, root); + } return r; } -- cgit v1.2.2 From f665c0f852316ff99e9eb7f71f34d43003f8e139 Mon Sep 17 00:00:00 2001 From: Stefan Priebe Date: Sat, 16 Nov 2013 21:26:52 +0100 Subject: bcache: kthread don't set writeback task to INTERUPTIBLE at the beginning (schedule_timout_interuptible) and others do his on their own This prevents wrong load average calculation (load of 1 per thread) Signed-off-by: Kent Overstreet --- drivers/md/bcache/writeback.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 99053b1251be..484e57d7012c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -500,8 +500,6 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc) if (IS_ERR(dc->writeback_thread)) return PTR_ERR(dc->writeback_thread); - set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); -- cgit v1.2.2 From ce2b3f595e1c56639085645e0130426e443008c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2013 17:28:37 -0800 Subject: bcache: Use uninterruptible sleep in writeback We're just waiting on kthread_should_stop(), nothing else, so interruptible sleep was wrong here. Signed-off-by: Kent Overstreet --- drivers/md/bcache/writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 484e57d7012c..3cd931d3f26c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -241,7 +241,7 @@ static void read_dirty(struct cached_dev *dc) if (KEY_START(&w->key) != dc->last_read || jiffies_to_msecs(delay) > 50) while (!kthread_should_stop() && delay) - delay = schedule_timeout_interruptible(delay); + delay = schedule_timeout_uninterruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -438,7 +438,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); + delay = schedule_timeout_uninterruptible(delay); } } -- cgit v1.2.2 From d24a6e1087030b6da286df9433add5fa2f21b83b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Nov 2013 21:55:27 -0800 Subject: bcache: Fix dirty_data accounting Dirty data accounting wasn't quite right - firstly, we were adding the key we're inserting after it could have merged with another dirty key already in the btree, and secondly we could sometimes pass the wrong offset to bcache_dev_sectors_dirty_add() for dirty data we were overwriting - which is important when tracking dirty data by stripe. NOTE FOR BACKPORTERS: For 3.10 (and 3.11?) there's other accounting fixes necessary that got squashed in with other patches; the full patch against 3.10 is 408cc2f47eeac93a, available at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-3.10-writeback-fixes Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2a46036..4a12b2f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1817,7 +1817,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(replace_key) != KEY_PTRS(k)) + if (KEY_PTRS(k) != KEY_PTRS(replace_key) || + KEY_DIRTY(k) != KEY_DIRTY(replace_key)) goto check_failed; /* skip past gen */ --- drivers/md/bcache/btree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index dc6e2be265b7..c36745e5986a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1817,7 +1817,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(replace_key) != KEY_PTRS(k)) + if (KEY_PTRS(k) != KEY_PTRS(replace_key) || + KEY_DIRTY(k) != KEY_DIRTY(replace_key)) goto check_failed; /* skip past gen */ -- cgit v1.2.2 From 9eb8ebeb2471b8cbaa078066aeb85fe5d46f5304 Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Tue, 22 Oct 2013 13:19:23 -0700 Subject: bcache: Fix for can_attach_cache() Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dec15cd2d797..c57bfa071a57 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1676,7 +1676,7 @@ err: static bool can_attach_cache(struct cache *ca, struct cache_set *c) { return ca->sb.block_size == c->sb.block_size && - ca->sb.bucket_size == c->sb.block_size && + ca->sb.bucket_size == c->sb.bucket_size && ca->sb.nr_in_set == c->sb.nr_in_set; } -- cgit v1.2.2 From 97d11a660fd906dbea3dccd2638495d8497c3c81 Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Wed, 23 Oct 2013 17:35:26 -0700 Subject: bcache: Fix heap_peek() macro Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 362c4b3f8b4a..1030c6020e98 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -110,7 +110,7 @@ do { \ _r; \ }) -#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) #define heap_full(h) ((h)->used == (h)->size) -- cgit v1.2.2 From bee63f40cb5f5e8ab2abfbc85acde99cc0acd4b5 Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Thu, 31 Oct 2013 19:25:18 -0700 Subject: bcache: fix for gc crashing when no sectors are used Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/movinggc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7c1275e66025..46c952379fab 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -184,7 +184,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r) static unsigned bucket_heap_top(struct cache *ca) { - return GC_SECTORS_USED(heap_peek(&ca->heap)); + struct bucket *b; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) -- cgit v1.2.2 From 981aa8c091e164ea51dd1e81b71a1f3852bbcceb Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Thu, 7 Nov 2013 17:53:19 -0800 Subject: bcache: bugfix - moving_gc now moves only correct buckets Removed gc_move_threshold because picking buckets only by threshold could lead moving extra buckets (ei. if there are buckets at the threshold that aren't supposed to be moved do to space considerations). This is replaced by a GC_MOVE bit in the gc_mark bitmask. Now only marked buckets get moved. Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 2 ++ drivers/md/bcache/bcache.h | 6 +++--- drivers/md/bcache/movinggc.c | 8 +++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 2b46bf1d7e40..4c9852d92b0a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -421,9 +421,11 @@ out: if (watermark <= WATERMARK_METADATA) { SET_GC_MARK(b, GC_MARK_METADATA); + SET_GC_MOVE(b, 0); b->prio = BTREE_PRIO; } else { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MOVE(b, 0); b->prio = INITIAL_PRIO; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4beb55a0ff30..a7b1a7631ed2 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -197,7 +197,7 @@ struct bucket { uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint8_t gc_gen; - uint16_t gc_mark; + uint16_t gc_mark; /* Bitfield used by GC. See below for field */ }; /* @@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_RECLAIMABLE 0 #define GC_MARK_DIRTY 1 #define GC_MARK_METADATA 2 -BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); +BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" #include "stats.h" @@ -445,7 +446,6 @@ struct cache { * call prio_write() to keep gens from wrapping. */ uint8_t need_save_prio; - unsigned gc_move_threshold; /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 46c952379fab..30f347d4e609 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) unsigned i; for (i = 0; i < KEY_PTRS(k); i++) { - struct cache *ca = PTR_CACHE(c, k, i); struct bucket *g = PTR_BUCKET(c, k, i); - if (GC_SECTORS_USED(g) < ca->gc_move_threshold) + if (GC_MOVE(g)) return true; } @@ -227,9 +226,8 @@ void bch_moving_gc(struct cache_set *c) sectors_to_move -= GC_SECTORS_USED(b); } - ca->gc_move_threshold = bucket_heap_top(ca); - - pr_debug("threshold %u", ca->gc_move_threshold); + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); } mutex_unlock(&c->bucket_lock); -- cgit v1.2.2 From bf0a628a95dba7f983b6047cea695fb066fb2512 Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Tue, 26 Nov 2013 19:14:23 -0800 Subject: bcache: fix for gc and writeback race Garbage collector needs to check keys in the writeback keybuf to make sure it's not invalidating buckets to which the writeback keys point to. Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index c36745e5986a..31bb53fcc67a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c) SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), GC_MARK_METADATA); + /* don't reclaim buckets to which writeback keys point */ + rcu_read_lock(); + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + struct cached_dev *dc; + struct keybuf_key *w, *n; + unsigned j; + + if (!d || UUID_FLASH_ONLY(&c->uuids[i])) + continue; + dc = container_of(d, struct cached_dev, disk); + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, + &dc->writeback_keys.keys, node) + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); + for_each_cache(ca, c, i) { uint64_t *i; -- cgit v1.2.2 From 6d3d1a9c542b19dff1c7d7c8354d0869e4655287 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2013 14:12:09 -0800 Subject: bcache: bugfix for race between moving_gc and bucket_invalidate There is a possibility for a bucket to be invalidated by the allocator while moving_gc was copying it's contents to another bucket, if the bucket only held cached data. To prevent this moving checks for a stale ptr (to an invalidated bucket), before and after reads. It it finds one, it simply ignores moving that data. This only affects bcache if the moving_gc was turned on, note that it's off by default. Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/movinggc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 30f347d4e609..f2f0998c4a91 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -64,11 +64,16 @@ static void write_moving_finish(struct closure *cl) static void read_moving_endio(struct bio *bio, int error) { + struct bbio *b = container_of(bio, struct bbio, bio); struct moving_io *io = container_of(bio->bi_private, struct moving_io, cl); if (error) io->op.error = error; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(io->op.c, &b->key, 0)) { + io->op.error = -EINTR; + } bch_bbio_endio(io->op.c, bio, error, "reading data to move"); } @@ -140,6 +145,11 @@ static void read_moving(struct cache_set *c) if (!w) break; + if (ptr_stale(c, &w->key, 0)) { + bch_keybuf_del(&c->moving_gc_keys, w); + continue; + } + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), GFP_KERNEL); -- cgit v1.2.2 From 16749c23c00c686ed168471963e3ddb0f3fcd855 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 13:58:34 -0800 Subject: bcache: New writeback PD controller The old writeback PD controller could get into states where it had throttled all the way down and take way too long to recover - it was too complicated to really understand what it was doing. This rewrites a good chunk of it to hopefully be simpler and make more sense, and it also pays more attention to units which should make the behaviour a bit easier to understand. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 6 +++--- drivers/md/bcache/sysfs.c | 50 +++++++++++++++++++++++++------------------ drivers/md/bcache/util.c | 8 ++++++- drivers/md/bcache/writeback.c | 47 ++++++++++++++++++++-------------------- 4 files changed, 62 insertions(+), 49 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index a7b1a7631ed2..754f43177483 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -373,14 +373,14 @@ struct cached_dev { unsigned char writeback_percent; unsigned writeback_delay; - int writeback_rate_change; - int64_t writeback_rate_derivative; uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_derivative; + int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_d_term; unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_d_smooth; }; enum alloc_watermarks { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 80d4c2bee18a..a1f85612f0b3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -83,7 +83,6 @@ rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_d_term); rw_attribute(writeback_rate_p_term_inverse); -rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -129,31 +128,41 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_print(writeback_rate, dc->writeback_rate.rate); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); var_print(writeback_rate_d_term); var_print(writeback_rate_p_term_inverse); - var_print(writeback_rate_d_smooth); if (attr == &sysfs_writeback_rate_debug) { + char rate[20]; char dirty[20]; - char derivative[20]; char target[20]; - bch_hprint(dirty, - bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + char proportional[20]; + char derivative[20]; + char change[20]; + s64 next_io; + + bch_hprint(rate, dc->writeback_rate.rate << 9); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional,dc->writeback_rate_proportional << 9); + bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(change, dc->writeback_rate_change << 9); + + next_io = div64_s64(dc->writeback_rate.next - local_clock(), + NSEC_PER_MSEC); return sprintf(buf, - "rate:\t\t%u\n" - "change:\t\t%i\n" + "rate:\t\t%s/sec\n" "dirty:\t\t%s\n" + "target:\t\t%s\n" + "proportional:\t%s\n" "derivative:\t%s\n" - "target:\t\t%s\n", - dc->writeback_rate.rate, - dc->writeback_rate_change, - dirty, derivative, target); + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, dirty, target, proportional, + derivative, change, next_io); } sysfs_hprint(dirty_data, @@ -189,6 +198,7 @@ STORE(__cached_dev) struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) sysfs_strtoul(data_csum, dc->disk.data_csum); @@ -197,16 +207,15 @@ STORE(__cached_dev) d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, 1000000); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - d_strtoul(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + + d_strtoul_nonzero(writeback_rate_update_seconds); d_strtoul(writeback_rate_d_term); - d_strtoul(writeback_rate_p_term_inverse); - sysfs_strtoul_clamp(writeback_rate_p_term_inverse, - dc->writeback_rate_p_term_inverse, 1, INT_MAX); - d_strtoul(writeback_rate_d_smooth); + d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_d_term, &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, &sysfs_stripe_size, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 462214eeacbe..bb37618e7664 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; return time_after64(d->next, now) ? div_u64(d->next - now, NSEC_PER_SEC / HZ) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3cd931d3f26c..6c44fe059c27 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc) /* PD controller */ - int change = 0; - int64_t error; int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; + int64_t proportional = dirty - target; + int64_t change; dc->disk.sectors_dirty_last = dirty; - derivative *= dc->writeback_rate_d_term; - derivative = clamp(derivative, -dirty, dirty); + /* Scale to sectors per second */ - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - dc->writeback_rate_d_smooth, 0); + proportional *= dc->writeback_rate_update_seconds; + proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - /* Avoid divide by zero */ - if (!target) - goto out; + derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - error = div64_s64((dirty + derivative - target) << 8, target); + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + (dc->writeback_rate_d_term / + dc->writeback_rate_update_seconds) ?: 1, 0); + + derivative *= dc->writeback_rate_d_term; + derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - change = div_s64((dc->writeback_rate.rate * error) >> 8, - dc->writeback_rate_p_term_inverse); + change = proportional + derivative; /* Don't increase writeback rate if the device isn't keeping up */ if (change > 0 && time_after64(local_clock(), - dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) + dc->writeback_rate.next + NSEC_PER_MSEC)) change = 0; dc->writeback_rate.rate = - clamp_t(int64_t, dc->writeback_rate.rate + change, + clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, 1, NSEC_PER_MSEC); -out: + + dc->writeback_rate_proportional = proportional; dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; @@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { - uint64_t ret; - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; - ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); - - return min_t(uint64_t, ret, HZ); + return bch_next_delay(&dc->writeback_rate, sectors); } struct dirty_io { @@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc) bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); + + dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } int bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -490,10 +490,9 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_update_seconds = 30; - dc->writeback_rate_d_term = 16; - dc->writeback_rate_p_term_inverse = 64; - dc->writeback_rate_d_smooth = 8; + dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_d_term = 30; + dc->writeback_rate_p_term_inverse = 6000; dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); -- cgit v1.2.2