diff options
author | Tejun Heo <tj@kernel.org> | 2016-02-29 18:28:53 -0500 |
---|---|---|
committer | Jens Axboe <axboe@fb.com> | 2016-03-03 16:42:50 -0500 |
commit | a1a0e23e49037c23ea84bc8cc146a03584d13577 (patch) | |
tree | b8bb7792ed9dd97358bdea0a6a2f36fdb617bbf0 /fs | |
parent | e9fc63d682dbbef17921aeb00d03fd52d6735ffd (diff) |
writeback: flush inode cgroup wb switches instead of pinning super_block
If cgroup writeback is in use, inodes can be scheduled for
asynchronous wb switching. Before 5ff8eaac1636 ("writeback: keep
superblock pinned during cgroup writeback association switches"), this
could race with umount leading to super_block being destroyed while
inodes are pinned for wb switching. 5ff8eaac1636 fixed it by bumping
s_active while wb switches are in flight; however, this allowed
in-flight wb switches to make umounts asynchronous when the userland
expected synchronosity - e.g. fsck immediately following umount may
fail because the device is still busy.
This patch removes the problematic super_block pinning and instead
makes generic_shutdown_super() flush in-flight wb switches. wb
switches are now executed on a dedicated isw_wq so that they can be
flushed and isw_nr_in_flight keeps track of the number of in-flight wb
switches so that flushing can be avoided in most cases.
v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE
check in inode_switch_wbs() as Jan an Al suggested.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Tahsin Erdogan <tahsin@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com
Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches")
Cc: stable@vger.kernel.org #v4.5
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/fs-writeback.c | 54 | ||||
-rw-r--r-- | fs/super.c | 1 |
2 files changed, 42 insertions, 13 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f76d8950a57..5c46ed9f3e14 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, | |||
223 | #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) | 223 | #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) |
224 | /* one round can affect upto 5 slots */ | 224 | /* one round can affect upto 5 slots */ |
225 | 225 | ||
226 | static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); | ||
227 | static struct workqueue_struct *isw_wq; | ||
228 | |||
226 | void __inode_attach_wb(struct inode *inode, struct page *page) | 229 | void __inode_attach_wb(struct inode *inode, struct page *page) |
227 | { | 230 | { |
228 | struct backing_dev_info *bdi = inode_to_bdi(inode); | 231 | struct backing_dev_info *bdi = inode_to_bdi(inode); |
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) | |||
317 | struct inode_switch_wbs_context *isw = | 320 | struct inode_switch_wbs_context *isw = |
318 | container_of(work, struct inode_switch_wbs_context, work); | 321 | container_of(work, struct inode_switch_wbs_context, work); |
319 | struct inode *inode = isw->inode; | 322 | struct inode *inode = isw->inode; |
320 | struct super_block *sb = inode->i_sb; | ||
321 | struct address_space *mapping = inode->i_mapping; | 323 | struct address_space *mapping = inode->i_mapping; |
322 | struct bdi_writeback *old_wb = inode->i_wb; | 324 | struct bdi_writeback *old_wb = inode->i_wb; |
323 | struct bdi_writeback *new_wb = isw->new_wb; | 325 | struct bdi_writeback *new_wb = isw->new_wb; |
@@ -424,8 +426,9 @@ skip_switch: | |||
424 | wb_put(new_wb); | 426 | wb_put(new_wb); |
425 | 427 | ||
426 | iput(inode); | 428 | iput(inode); |
427 | deactivate_super(sb); | ||
428 | kfree(isw); | 429 | kfree(isw); |
430 | |||
431 | atomic_dec(&isw_nr_in_flight); | ||
429 | } | 432 | } |
430 | 433 | ||
431 | static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) | 434 | static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) |
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) | |||
435 | 438 | ||
436 | /* needs to grab bh-unsafe locks, bounce to work item */ | 439 | /* needs to grab bh-unsafe locks, bounce to work item */ |
437 | INIT_WORK(&isw->work, inode_switch_wbs_work_fn); | 440 | INIT_WORK(&isw->work, inode_switch_wbs_work_fn); |
438 | schedule_work(&isw->work); | 441 | queue_work(isw_wq, &isw->work); |
439 | } | 442 | } |
440 | 443 | ||
441 | /** | 444 | /** |
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) | |||
471 | 474 | ||
472 | /* while holding I_WB_SWITCH, no one else can update the association */ | 475 | /* while holding I_WB_SWITCH, no one else can update the association */ |
473 | spin_lock(&inode->i_lock); | 476 | spin_lock(&inode->i_lock); |
474 | 477 | if (!(inode->i_sb->s_flags & MS_ACTIVE) || | |
475 | if (inode->i_state & (I_WB_SWITCH | I_FREEING) || | 478 | inode->i_state & (I_WB_SWITCH | I_FREEING) || |
476 | inode_to_wb(inode) == isw->new_wb) | 479 | inode_to_wb(inode) == isw->new_wb) { |
477 | goto out_unlock; | 480 | spin_unlock(&inode->i_lock); |
478 | 481 | goto out_free; | |
479 | if (!atomic_inc_not_zero(&inode->i_sb->s_active)) | 482 | } |
480 | goto out_unlock; | ||
481 | |||
482 | inode->i_state |= I_WB_SWITCH; | 483 | inode->i_state |= I_WB_SWITCH; |
483 | spin_unlock(&inode->i_lock); | 484 | spin_unlock(&inode->i_lock); |
484 | 485 | ||
485 | ihold(inode); | 486 | ihold(inode); |
486 | isw->inode = inode; | 487 | isw->inode = inode; |
487 | 488 | ||
489 | atomic_inc(&isw_nr_in_flight); | ||
490 | |||
488 | /* | 491 | /* |
489 | * In addition to synchronizing among switchers, I_WB_SWITCH tells | 492 | * In addition to synchronizing among switchers, I_WB_SWITCH tells |
490 | * the RCU protected stat update paths to grab the mapping's | 493 | * the RCU protected stat update paths to grab the mapping's |
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) | |||
494 | call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); | 497 | call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); |
495 | return; | 498 | return; |
496 | 499 | ||
497 | out_unlock: | ||
498 | spin_unlock(&inode->i_lock); | ||
499 | out_free: | 500 | out_free: |
500 | if (isw->new_wb) | 501 | if (isw->new_wb) |
501 | wb_put(isw->new_wb); | 502 | wb_put(isw->new_wb); |
@@ -847,6 +848,33 @@ restart: | |||
847 | wb_put(last_wb); | 848 | wb_put(last_wb); |
848 | } | 849 | } |
849 | 850 | ||
851 | /** | ||
852 | * cgroup_writeback_umount - flush inode wb switches for umount | ||
853 | * | ||
854 | * This function is called when a super_block is about to be destroyed and | ||
855 | * flushes in-flight inode wb switches. An inode wb switch goes through | ||
856 | * RCU and then workqueue, so the two need to be flushed in order to ensure | ||
857 | * that all previously scheduled switches are finished. As wb switches are | ||
858 | * rare occurrences and synchronize_rcu() can take a while, perform | ||
859 | * flushing iff wb switches are in flight. | ||
860 | */ | ||
861 | void cgroup_writeback_umount(void) | ||
862 | { | ||
863 | if (atomic_read(&isw_nr_in_flight)) { | ||
864 | synchronize_rcu(); | ||
865 | flush_workqueue(isw_wq); | ||
866 | } | ||
867 | } | ||
868 | |||
869 | static int __init cgroup_writeback_init(void) | ||
870 | { | ||
871 | isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); | ||
872 | if (!isw_wq) | ||
873 | return -ENOMEM; | ||
874 | return 0; | ||
875 | } | ||
876 | fs_initcall(cgroup_writeback_init); | ||
877 | |||
850 | #else /* CONFIG_CGROUP_WRITEBACK */ | 878 | #else /* CONFIG_CGROUP_WRITEBACK */ |
851 | 879 | ||
852 | static struct bdi_writeback * | 880 | static struct bdi_writeback * |
diff --git a/fs/super.c b/fs/super.c index 1182af8fd5ff..74914b1bae70 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb) | |||
415 | sb->s_flags &= ~MS_ACTIVE; | 415 | sb->s_flags &= ~MS_ACTIVE; |
416 | 416 | ||
417 | fsnotify_unmount_inodes(sb); | 417 | fsnotify_unmount_inodes(sb); |
418 | cgroup_writeback_umount(); | ||
418 | 419 | ||
419 | evict_inodes(sb); | 420 | evict_inodes(sb); |
420 | 421 | ||