diff options
| author | Daeho Jeong <daeho.jeong@samsung.com> | 2016-04-25 23:22:35 -0400 |
|---|---|---|
| committer | Theodore Ts'o <tytso@mit.edu> | 2016-04-25 23:22:35 -0400 |
| commit | c8585c6fcaf2011de54c3592e80a634a2b9e1a7f (patch) | |
| tree | 66b440a3e710ab4b63ae20200735a594bffb74cd | |
| parent | 4c54659269ecb799133758330e7ea2a6fa4c65ca (diff) | |
ext4: fix races between changing inode journal mode and ext4_writepages
In ext4, there is a race condition between changing inode journal mode
and ext4_writepages(). While ext4_writepages() is executed on a
non-journalled mode inode, the inode's journal mode could be enabled
by ioctl() and then, some pages dirtied after switching the journal
mode will be still exposed to ext4_writepages() in non-journaled mode.
To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan
Kara's suggestion because we don't want to waste ext4_inode_info's
space for this extra rare case.
Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
| -rw-r--r-- | fs/ext4/ext4.h | 4 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 15 | ||||
| -rw-r--r-- | fs/ext4/super.c | 4 | ||||
| -rw-r--r-- | kernel/locking/percpu-rwsem.c | 1 |
4 files changed, 21 insertions, 3 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cb00b1119ec9..ba5aecc07fbc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/ratelimit.h> | 33 | #include <linux/ratelimit.h> |
| 34 | #include <crypto/hash.h> | 34 | #include <crypto/hash.h> |
| 35 | #include <linux/falloc.h> | 35 | #include <linux/falloc.h> |
| 36 | #include <linux/percpu-rwsem.h> | ||
| 36 | #ifdef __KERNEL__ | 37 | #ifdef __KERNEL__ |
| 37 | #include <linux/compat.h> | 38 | #include <linux/compat.h> |
| 38 | #endif | 39 | #endif |
| @@ -1508,6 +1509,9 @@ struct ext4_sb_info { | |||
| 1508 | struct ratelimit_state s_err_ratelimit_state; | 1509 | struct ratelimit_state s_err_ratelimit_state; |
| 1509 | struct ratelimit_state s_warning_ratelimit_state; | 1510 | struct ratelimit_state s_warning_ratelimit_state; |
| 1510 | struct ratelimit_state s_msg_ratelimit_state; | 1511 | struct ratelimit_state s_msg_ratelimit_state; |
| 1512 | |||
| 1513 | /* Barrier between changing inodes' journal flags and writepages ops. */ | ||
| 1514 | struct percpu_rw_semaphore s_journal_flag_rwsem; | ||
| 1511 | }; | 1515 | }; |
| 1512 | 1516 | ||
| 1513 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | 1517 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 779ef4c11bc1..4d8ebbe00456 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping, | |||
| 2612 | struct blk_plug plug; | 2612 | struct blk_plug plug; |
| 2613 | bool give_up_on_write = false; | 2613 | bool give_up_on_write = false; |
| 2614 | 2614 | ||
| 2615 | percpu_down_read(&sbi->s_journal_flag_rwsem); | ||
| 2615 | trace_ext4_writepages(inode, wbc); | 2616 | trace_ext4_writepages(inode, wbc); |
| 2616 | 2617 | ||
| 2617 | if (dax_mapping(mapping)) | 2618 | if (dax_mapping(mapping)) { |
| 2618 | return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, | 2619 | ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, |
| 2619 | wbc); | 2620 | wbc); |
| 2621 | goto out_writepages; | ||
| 2622 | } | ||
| 2620 | 2623 | ||
| 2621 | /* | 2624 | /* |
| 2622 | * No pages to write? This is mainly a kludge to avoid starting | 2625 | * No pages to write? This is mainly a kludge to avoid starting |
| @@ -2786,6 +2789,7 @@ retry: | |||
| 2786 | out_writepages: | 2789 | out_writepages: |
| 2787 | trace_ext4_writepages_result(inode, wbc, ret, | 2790 | trace_ext4_writepages_result(inode, wbc, ret, |
| 2788 | nr_to_write - wbc->nr_to_write); | 2791 | nr_to_write - wbc->nr_to_write); |
| 2792 | percpu_up_read(&sbi->s_journal_flag_rwsem); | ||
| 2789 | return ret; | 2793 | return ret; |
| 2790 | } | 2794 | } |
| 2791 | 2795 | ||
| @@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
| 5436 | journal_t *journal; | 5440 | journal_t *journal; |
| 5437 | handle_t *handle; | 5441 | handle_t *handle; |
| 5438 | int err; | 5442 | int err; |
| 5443 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 5439 | 5444 | ||
| 5440 | /* | 5445 | /* |
| 5441 | * We have to be very careful here: changing a data block's | 5446 | * We have to be very careful here: changing a data block's |
| @@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
| 5475 | } | 5480 | } |
| 5476 | } | 5481 | } |
| 5477 | 5482 | ||
| 5483 | percpu_down_write(&sbi->s_journal_flag_rwsem); | ||
| 5478 | jbd2_journal_lock_updates(journal); | 5484 | jbd2_journal_lock_updates(journal); |
| 5479 | 5485 | ||
| 5480 | /* | 5486 | /* |
| @@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
| 5491 | err = jbd2_journal_flush(journal); | 5497 | err = jbd2_journal_flush(journal); |
| 5492 | if (err < 0) { | 5498 | if (err < 0) { |
| 5493 | jbd2_journal_unlock_updates(journal); | 5499 | jbd2_journal_unlock_updates(journal); |
| 5500 | percpu_up_write(&sbi->s_journal_flag_rwsem); | ||
| 5494 | ext4_inode_resume_unlocked_dio(inode); | 5501 | ext4_inode_resume_unlocked_dio(inode); |
| 5495 | return err; | 5502 | return err; |
| 5496 | } | 5503 | } |
| @@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
| 5499 | ext4_set_aops(inode); | 5506 | ext4_set_aops(inode); |
| 5500 | 5507 | ||
| 5501 | jbd2_journal_unlock_updates(journal); | 5508 | jbd2_journal_unlock_updates(journal); |
| 5509 | percpu_up_write(&sbi->s_journal_flag_rwsem); | ||
| 5510 | |||
| 5502 | if (val) | 5511 | if (val) |
| 5503 | up_write(&EXT4_I(inode)->i_mmap_sem); | 5512 | up_write(&EXT4_I(inode)->i_mmap_sem); |
| 5504 | ext4_inode_resume_unlocked_dio(inode); | 5513 | ext4_inode_resume_unlocked_dio(inode); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 304c712dbe12..20c5d52253b4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb) | |||
| 859 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 859 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
| 860 | percpu_counter_destroy(&sbi->s_dirs_counter); | 860 | percpu_counter_destroy(&sbi->s_dirs_counter); |
| 861 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); | 861 | percpu_counter_destroy(&sbi->s_dirtyclusters_counter); |
| 862 | percpu_free_rwsem(&sbi->s_journal_flag_rwsem); | ||
| 862 | brelse(sbi->s_sbh); | 863 | brelse(sbi->s_sbh); |
| 863 | #ifdef CONFIG_QUOTA | 864 | #ifdef CONFIG_QUOTA |
| 864 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 865 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| @@ -3930,6 +3931,9 @@ no_journal: | |||
| 3930 | if (!err) | 3931 | if (!err) |
| 3931 | err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, | 3932 | err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, |
| 3932 | GFP_KERNEL); | 3933 | GFP_KERNEL); |
| 3934 | if (!err) | ||
| 3935 | err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); | ||
| 3936 | |||
| 3933 | if (err) { | 3937 | if (err) { |
| 3934 | ext4_msg(sb, KERN_ERR, "insufficient memory"); | 3938 | ext4_msg(sb, KERN_ERR, "insufficient memory"); |
| 3935 | goto failed_mount6; | 3939 | goto failed_mount6; |
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..bec0b647f9cc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
| @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) | |||
| 37 | free_percpu(brw->fast_read_ctr); | 37 | free_percpu(brw->fast_read_ctr); |
| 38 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ | 38 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ |
| 39 | } | 39 | } |
| 40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); | ||
| 40 | 41 | ||
| 41 | /* | 42 | /* |
| 42 | * This is the fast-path for down_read/up_read. If it succeeds we rely | 43 | * This is the fast-path for down_read/up_read. If it succeeds we rely |
