diff options
author | Jan Kara <jack@suse.cz> | 2018-06-18 09:46:58 -0400 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2018-06-22 14:08:07 -0400 |
commit | 3ee7e8697d5860b173132606d80a9cd35e7113ee (patch) | |
tree | 8612a44c612f094fac83ddbfa9eead7725b960f7 | |
parent | 0ae52ddf5bd7f685bb43d7687290f6c2eeacfb31 (diff) |
bdi: Fix another oops in wb_workfn()
syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to
wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was
WB_shutting_down after wb->bdi->dev became NULL. This indicates that
bdi_unregister() failed to call wb_shutdown() on one of the wb objects.
The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus
drops the bdi's reference to wb structures before going through the list
of wbs again and calling wb_shutdown() on each of them. This way the loop
iterating through all wbs can easily miss a wb if that wb has already
passed through cgwb_remove_from_bdi_list() called from wb_shutdown()
from cgwb_release_workfn() and as a result fully shut down the bdi although
wb_workfn() for this wb structure is still running. In fact there are
also other ways cgwb_bdi_unregister() can race with
cgwb_release_workfn() leading e.g. to use-after-free issues:
CPU1                            CPU2
                                cgwb_bdi_unregister()
                                  cgwb_kill(*slot);
cgwb_release()
  queue_work(cgwb_release_wq, &wb->release_work);
cgwb_release_workfn()
                                  wb = list_first_entry(&bdi->wb_list, ...)
                                  spin_unlock_irq(&cgwb_lock);
  wb_shutdown(wb);
  ...
  kfree_rcu(wb, rcu);
                                  wb_shutdown(wb); -> oops use-after-free
We solve these issues by synchronizing writeback structure shutdown from
cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That
way we also no longer need synchronization using WB_shutting_down, as the
mutex provides it for the CONFIG_CGROUP_WRITEBACK case, and without
CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from
bdi_unregister().
Reported-by: syzbot <syzbot+4a7438e774b21ddd8eca@syzkaller.appspotmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- | include/linux/backing-dev-defs.h | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 20 |
2 files changed, 8 insertions, 14 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0bd432a4d7bd..24251762c20c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h | |||
@@ -22,7 +22,6 @@ struct dentry; | |||
22 | */ | 22 | */ |
23 | enum wb_state { | 23 | enum wb_state { |
24 | WB_registered, /* bdi_register() was done */ | 24 | WB_registered, /* bdi_register() was done */ |
25 | WB_shutting_down, /* wb_shutdown() in progress */ | ||
26 | WB_writeback_running, /* Writeback is in progress */ | 25 | WB_writeback_running, /* Writeback is in progress */ |
27 | WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ | 26 | WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ |
28 | WB_start_all, /* nr_pages == 0 (all) work pending */ | 27 | WB_start_all, /* nr_pages == 0 (all) work pending */ |
@@ -189,6 +188,7 @@ struct backing_dev_info { | |||
189 | #ifdef CONFIG_CGROUP_WRITEBACK | 188 | #ifdef CONFIG_CGROUP_WRITEBACK |
190 | struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ | 189 | struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ |
191 | struct rb_root cgwb_congested_tree; /* their congested states */ | 190 | struct rb_root cgwb_congested_tree; /* their congested states */ |
191 | struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ | ||
192 | #else | 192 | #else |
193 | struct bdi_writeback_congested *wb_congested; | 193 | struct bdi_writeback_congested *wb_congested; |
194 | #endif | 194 | #endif |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 347cc834c04a..2e5d3df0853d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb) | |||
359 | spin_lock_bh(&wb->work_lock); | 359 | spin_lock_bh(&wb->work_lock); |
360 | if (!test_and_clear_bit(WB_registered, &wb->state)) { | 360 | if (!test_and_clear_bit(WB_registered, &wb->state)) { |
361 | spin_unlock_bh(&wb->work_lock); | 361 | spin_unlock_bh(&wb->work_lock); |
362 | /* | ||
363 | * Wait for wb shutdown to finish if someone else is just | ||
364 | * running wb_shutdown(). Otherwise we could proceed to wb / | ||
365 | * bdi destruction before wb_shutdown() is finished. | ||
366 | */ | ||
367 | wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); | ||
368 | return; | 362 | return; |
369 | } | 363 | } |
370 | set_bit(WB_shutting_down, &wb->state); | ||
371 | spin_unlock_bh(&wb->work_lock); | 364 | spin_unlock_bh(&wb->work_lock); |
372 | 365 | ||
373 | cgwb_remove_from_bdi_list(wb); | 366 | cgwb_remove_from_bdi_list(wb); |
@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb) | |||
379 | mod_delayed_work(bdi_wq, &wb->dwork, 0); | 372 | mod_delayed_work(bdi_wq, &wb->dwork, 0); |
380 | flush_delayed_work(&wb->dwork); | 373 | flush_delayed_work(&wb->dwork); |
381 | WARN_ON(!list_empty(&wb->work_list)); | 374 | WARN_ON(!list_empty(&wb->work_list)); |
382 | /* | ||
383 | * Make sure bit gets cleared after shutdown is finished. Matches with | ||
384 | * the barrier provided by test_and_clear_bit() above. | ||
385 | */ | ||
386 | smp_wmb(); | ||
387 | clear_and_wake_up_bit(WB_shutting_down, &wb->state); | ||
388 | } | 375 | } |
389 | 376 | ||
390 | static void wb_exit(struct bdi_writeback *wb) | 377 | static void wb_exit(struct bdi_writeback *wb) |
@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work) | |||
508 | struct bdi_writeback *wb = container_of(work, struct bdi_writeback, | 495 | struct bdi_writeback *wb = container_of(work, struct bdi_writeback, |
509 | release_work); | 496 | release_work); |
510 | 497 | ||
498 | mutex_lock(&wb->bdi->cgwb_release_mutex); | ||
511 | wb_shutdown(wb); | 499 | wb_shutdown(wb); |
512 | 500 | ||
513 | css_put(wb->memcg_css); | 501 | css_put(wb->memcg_css); |
514 | css_put(wb->blkcg_css); | 502 | css_put(wb->blkcg_css); |
503 | mutex_unlock(&wb->bdi->cgwb_release_mutex); | ||
515 | 504 | ||
516 | fprop_local_destroy_percpu(&wb->memcg_completions); | 505 | fprop_local_destroy_percpu(&wb->memcg_completions); |
517 | percpu_ref_exit(&wb->refcnt); | 506 | percpu_ref_exit(&wb->refcnt); |
@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) | |||
697 | 686 | ||
698 | INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); | 687 | INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); |
699 | bdi->cgwb_congested_tree = RB_ROOT; | 688 | bdi->cgwb_congested_tree = RB_ROOT; |
689 | mutex_init(&bdi->cgwb_release_mutex); | ||
700 | 690 | ||
701 | ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); | 691 | ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); |
702 | if (!ret) { | 692 | if (!ret) { |
@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) | |||
717 | spin_lock_irq(&cgwb_lock); | 707 | spin_lock_irq(&cgwb_lock); |
718 | radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) | 708 | radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) |
719 | cgwb_kill(*slot); | 709 | cgwb_kill(*slot); |
710 | spin_unlock_irq(&cgwb_lock); | ||
720 | 711 | ||
712 | mutex_lock(&bdi->cgwb_release_mutex); | ||
713 | spin_lock_irq(&cgwb_lock); | ||
721 | while (!list_empty(&bdi->wb_list)) { | 714 | while (!list_empty(&bdi->wb_list)) { |
722 | wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, | 715 | wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, |
723 | bdi_node); | 716 | bdi_node); |
@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) | |||
726 | spin_lock_irq(&cgwb_lock); | 719 | spin_lock_irq(&cgwb_lock); |
727 | } | 720 | } |
728 | spin_unlock_irq(&cgwb_lock); | 721 | spin_unlock_irq(&cgwb_lock); |
722 | mutex_unlock(&bdi->cgwb_release_mutex); | ||
729 | } | 723 | } |
730 | 724 | ||
731 | /** | 725 | /** |