summary refs log tree commit diff stats
diff options
context:
space:
mode:
author	Jan Kara <jack@suse.cz>	2018-06-18 09:46:58 -0400
committer	Jens Axboe <axboe@kernel.dk>	2018-06-22 14:08:07 -0400
commit	3ee7e8697d5860b173132606d80a9cd35e7113ee (patch)
tree	8612a44c612f094fac83ddbfa9eead7725b960f7
parent	0ae52ddf5bd7f685bb43d7687290f6c2eeacfb31 (diff)
bdi: Fix another oops in wb_workfn()
syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to
wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was
WB_shutting_down after wb->bdi->dev became NULL. This indicates that
unregister_bdi() failed to call wb_shutdown() on one of wb objects.

The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus
drops bdi's reference to wb structures before going through the list of
wbs again and calling wb_shutdown() on each of them. This way the loop
iterating through all wbs can easily miss a wb if that wb has already
passed through cgwb_remove_from_bdi_list() called from wb_shutdown()
from cgwb_release_workfn() and as a result fully shutdown bdi although
wb_workfn() for this wb structure is still running. In fact there are
also other ways cgwb_bdi_unregister() can race with
cgwb_release_workfn() leading e.g. to use-after-free issues:

CPU1                                            CPU2
                                                cgwb_bdi_unregister()
                                                  cgwb_kill(*slot);

cgwb_release()
  queue_work(cgwb_release_wq, &wb->release_work);
cgwb_release_workfn()
                                                  wb = list_first_entry(&bdi->wb_list, ...)
                                                  spin_unlock_irq(&cgwb_lock);
  wb_shutdown(wb);
  ...
  kfree_rcu(wb, rcu);
                                                  wb_shutdown(wb); -> oops use-after-free

We solve these issues by synchronizing writeback structure shutdown from
cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That
way we also no longer need synchronization using WB_shutting_down as the
mutex provides it for CONFIG_CGROUP_WRITEBACK case and without
CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from
bdi_unregister().

Reported-by: syzbot <syzbot+4a7438e774b21ddd8eca@syzkaller.appspotmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--	include/linux/backing-dev-defs.h	2
-rw-r--r--	mm/backing-dev.c	20
2 files changed, 8 insertions, 14 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 0bd432a4d7bd..24251762c20c 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -22,7 +22,6 @@ struct dentry;
  */
 enum wb_state {
 	WB_registered,		/* bdi_register() was done */
-	WB_shutting_down,	/* wb_shutdown() in progress */
 	WB_writeback_running,	/* Writeback is in progress */
 	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
@@ -189,6 +188,7 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
+	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 347cc834c04a..2e5d3df0853d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	spin_lock_bh(&wb->work_lock);
 	if (!test_and_clear_bit(WB_registered, &wb->state)) {
 		spin_unlock_bh(&wb->work_lock);
-		/*
-		 * Wait for wb shutdown to finish if someone else is just
-		 * running wb_shutdown(). Otherwise we could proceed to wb /
-		 * bdi destruction before wb_shutdown() is finished.
-		 */
-		wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
 		return;
 	}
-	set_bit(WB_shutting_down, &wb->state);
 	spin_unlock_bh(&wb->work_lock);
 
 	cgwb_remove_from_bdi_list(wb);
@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	flush_delayed_work(&wb->dwork);
 	WARN_ON(!list_empty(&wb->work_list));
-	/*
-	 * Make sure bit gets cleared after shutdown is finished. Matches with
-	 * the barrier provided by test_and_clear_bit() above.
-	 */
-	smp_wmb();
-	clear_and_wake_up_bit(WB_shutting_down, &wb->state);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work)
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
 						release_work);
 
+	mutex_lock(&wb->bdi->cgwb_release_mutex);
 	wb_shutdown(wb);
 
 	css_put(wb->memcg_css);
 	css_put(wb->blkcg_css);
+	mutex_unlock(&wb->bdi->cgwb_release_mutex);
 
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
+	mutex_init(&bdi->cgwb_release_mutex);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 	spin_lock_irq(&cgwb_lock);
 	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
 		cgwb_kill(*slot);
+	spin_unlock_irq(&cgwb_lock);
 
+	mutex_lock(&bdi->cgwb_release_mutex);
+	spin_lock_irq(&cgwb_lock);
 	while (!list_empty(&bdi->wb_list)) {
 		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
 				      bdi_node);
@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 		spin_lock_irq(&cgwb_lock);
 	}
 	spin_unlock_irq(&cgwb_lock);
+	mutex_unlock(&bdi->cgwb_release_mutex);
 }
 
 /**