aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaeho Jeong <daeho.jeong@samsung.com>2016-04-25 23:22:35 -0400
committerTheodore Ts'o <tytso@mit.edu>2016-04-25 23:22:35 -0400
commitc8585c6fcaf2011de54c3592e80a634a2b9e1a7f (patch)
tree66b440a3e710ab4b63ae20200735a594bffb74cd
parent4c54659269ecb799133758330e7ea2a6fa4c65ca (diff)
ext4: fix races between changing inode journal mode and ext4_writepages
In ext4, there is a race condition between changing inode journal mode and ext4_writepages(). While ext4_writepages() is executed on a non-journalled mode inode, the inode's journal mode could be enabled by ioctl() and then, some pages dirtied after switching the journal mode will be still exposed to ext4_writepages() in non-journaled mode. To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan Kara's suggestion because we don't want to waste ext4_inode_info's space for this extra rare case. Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Reviewed-by: Jan Kara <jack@suse.cz>
-rw-r--r--fs/ext4/ext4.h4
-rw-r--r--fs/ext4/inode.c15
-rw-r--r--fs/ext4/super.c4
-rw-r--r--kernel/locking/percpu-rwsem.c1
4 files changed, 21 insertions, 3 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cb00b1119ec9..ba5aecc07fbc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@
33#include <linux/ratelimit.h> 33#include <linux/ratelimit.h>
34#include <crypto/hash.h> 34#include <crypto/hash.h>
35#include <linux/falloc.h> 35#include <linux/falloc.h>
36#include <linux/percpu-rwsem.h>
36#ifdef __KERNEL__ 37#ifdef __KERNEL__
37#include <linux/compat.h> 38#include <linux/compat.h>
38#endif 39#endif
@@ -1508,6 +1509,9 @@ struct ext4_sb_info {
1508 struct ratelimit_state s_err_ratelimit_state; 1509 struct ratelimit_state s_err_ratelimit_state;
1509 struct ratelimit_state s_warning_ratelimit_state; 1510 struct ratelimit_state s_warning_ratelimit_state;
1510 struct ratelimit_state s_msg_ratelimit_state; 1511 struct ratelimit_state s_msg_ratelimit_state;
1512
1513 /* Barrier between changing inodes' journal flags and writepages ops. */
1514 struct percpu_rw_semaphore s_journal_flag_rwsem;
1511}; 1515};
1512 1516
1513static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1517static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 779ef4c11bc1..4d8ebbe00456 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
2612 struct blk_plug plug; 2612 struct blk_plug plug;
2613 bool give_up_on_write = false; 2613 bool give_up_on_write = false;
2614 2614
2615 percpu_down_read(&sbi->s_journal_flag_rwsem);
2615 trace_ext4_writepages(inode, wbc); 2616 trace_ext4_writepages(inode, wbc);
2616 2617
2617 if (dax_mapping(mapping)) 2618 if (dax_mapping(mapping)) {
2618 return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, 2619 ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
2619 wbc); 2620 wbc);
2621 goto out_writepages;
2622 }
2620 2623
2621 /* 2624 /*
2622 * No pages to write? This is mainly a kludge to avoid starting 2625 * No pages to write? This is mainly a kludge to avoid starting
@@ -2786,6 +2789,7 @@ retry:
2786out_writepages: 2789out_writepages:
2787 trace_ext4_writepages_result(inode, wbc, ret, 2790 trace_ext4_writepages_result(inode, wbc, ret,
2788 nr_to_write - wbc->nr_to_write); 2791 nr_to_write - wbc->nr_to_write);
2792 percpu_up_read(&sbi->s_journal_flag_rwsem);
2789 return ret; 2793 return ret;
2790} 2794}
2791 2795
@@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5436 journal_t *journal; 5440 journal_t *journal;
5437 handle_t *handle; 5441 handle_t *handle;
5438 int err; 5442 int err;
5443 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5439 5444
5440 /* 5445 /*
5441 * We have to be very careful here: changing a data block's 5446 * We have to be very careful here: changing a data block's
@@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5475 } 5480 }
5476 } 5481 }
5477 5482
5483 percpu_down_write(&sbi->s_journal_flag_rwsem);
5478 jbd2_journal_lock_updates(journal); 5484 jbd2_journal_lock_updates(journal);
5479 5485
5480 /* 5486 /*
@@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5491 err = jbd2_journal_flush(journal); 5497 err = jbd2_journal_flush(journal);
5492 if (err < 0) { 5498 if (err < 0) {
5493 jbd2_journal_unlock_updates(journal); 5499 jbd2_journal_unlock_updates(journal);
5500 percpu_up_write(&sbi->s_journal_flag_rwsem);
5494 ext4_inode_resume_unlocked_dio(inode); 5501 ext4_inode_resume_unlocked_dio(inode);
5495 return err; 5502 return err;
5496 } 5503 }
@@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5499 ext4_set_aops(inode); 5506 ext4_set_aops(inode);
5500 5507
5501 jbd2_journal_unlock_updates(journal); 5508 jbd2_journal_unlock_updates(journal);
5509 percpu_up_write(&sbi->s_journal_flag_rwsem);
5510
5502 if (val) 5511 if (val)
5503 up_write(&EXT4_I(inode)->i_mmap_sem); 5512 up_write(&EXT4_I(inode)->i_mmap_sem);
5504 ext4_inode_resume_unlocked_dio(inode); 5513 ext4_inode_resume_unlocked_dio(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712dbe12..20c5d52253b4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
859 percpu_counter_destroy(&sbi->s_freeinodes_counter); 859 percpu_counter_destroy(&sbi->s_freeinodes_counter);
860 percpu_counter_destroy(&sbi->s_dirs_counter); 860 percpu_counter_destroy(&sbi->s_dirs_counter);
861 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 861 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
862 percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
862 brelse(sbi->s_sbh); 863 brelse(sbi->s_sbh);
863#ifdef CONFIG_QUOTA 864#ifdef CONFIG_QUOTA
864 for (i = 0; i < EXT4_MAXQUOTAS; i++) 865 for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3930,6 +3931,9 @@ no_journal:
3930 if (!err) 3931 if (!err)
3931 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 3932 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
3932 GFP_KERNEL); 3933 GFP_KERNEL);
3934 if (!err)
3935 err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
3936
3933 if (err) { 3937 if (err) {
3934 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3938 ext4_msg(sb, KERN_ERR, "insufficient memory");
3935 goto failed_mount6; 3939 goto failed_mount6;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..bec0b647f9cc 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
37 free_percpu(brw->fast_read_ctr); 37 free_percpu(brw->fast_read_ctr);
38 brw->fast_read_ctr = NULL; /* catch use after free bugs */ 38 brw->fast_read_ctr = NULL; /* catch use after free bugs */
39} 39}
40EXPORT_SYMBOL_GPL(percpu_free_rwsem);
40 41
41/* 42/*
42 * This is the fast-path for down_read/up_read. If it succeeds we rely 43 * This is the fast-path for down_read/up_read. If it succeeds we rely