From 39e3ac2599a5f9aba499b5f8af809108e70a6163 Mon Sep 17 00:00:00 2001 From: Brian King Date: Wed, 27 Oct 2010 21:25:12 -0400 Subject: jbd2: Fix I/O hang in jbd2_journal_release_jbd_inode This fixes a hang seen in jbd2_journal_release_jbd_inode on a lot of Power 6 systems running with ext4. When we get in the hung state, all I/O to the disk in question gets blocked where we stay indefinitely. Looking at the task list, I can see we are stuck in jbd2_journal_release_jbd_inode waiting on a wake up. I added some debug code to detect this scenario and dump additional data if we were stuck in jbd2_journal_release_jbd_inode for longer than 30 minutes. When it hit, I was able to see that i_flags was 0, suggesting we missed the wake up. This patch changes i_flags to be an unsigned long, uses bit operators to access it, and adds barriers around the accesses. Prior to applying this patch, we were regularly hitting this hang on numerous systems in our test environment. After applying the patch, the hangs no longer occur. Signed-off-by: Brian King Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0b52924a0cb6..2ae86aa21fce 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -395,7 +395,7 @@ struct jbd2_inode { struct inode *i_vfs_inode; /* Flags of inode [j_list_lock] */ - unsigned int i_flags; + unsigned long i_flags; }; struct jbd2_revoke_table_s; -- cgit v1.2.2 From e6fa0be699449d28a20e815bfe9ce26725ec4962 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 27 Oct 2010 21:30:04 -0400 Subject: Add helper function for blkdev_issue_zeroout (sb_issue_discard) This is done the same way as helper sb_issue_discard for blkdev_issue_discard. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- include/linux/blkdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c54906f678f..e5cb4d029689 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -941,6 +941,14 @@ static inline int sb_issue_discard(struct super_block *sb, return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS, BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } +static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, + sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) +{ + return blkdev_issue_zeroout(sb->s_bdev, + block << (sb->s_blocksize_bits - 9), + nr_blocks << (sb->s_blocksize_bits - 9), + gfp_mask, flags); +} extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -- cgit v1.2.2 From 367a51a339020ba4d9edb0ce0f21d65bd50b00c9 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 27 Oct 2010 21:30:11 -0400 Subject: fs: Add FITRIM ioctl Adds an filesystem independent ioctl to allow implementation of file system batched discard support. I takes fstrim_range structure as an argument. fstrim_range is definec in the include/fs.h and its definition is as follows. struct fstrim_range { start; len; minlen; } start - first Byte to trim len - number of Bytes to trim from start minlen - minimum extent length to trim, free extents shorter than this number of Bytes will be ignored. This will be rounded up to fs block size. It is also possible to specify NULL as an argument. In this case the arguments will set itself as follows: start = 0; len = ULLONG_MAX; minlen = 0; So it will trim the whole file system at one run. After the FITRIM is done, the number of actually discarded Bytes is stored in fstrim_range.len to give the user better insight on how much storage space has been really released for wear-leveling. Signed-off-by: Lukas Czerner Reviewed-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- include/linux/fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 63d069bd80b7..7008268e9b5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -32,6 +32,12 @@ #define SEEK_END 2 /* seek relative to end of file */ #define SEEK_MAX SEEK_END +struct fstrim_range { + uint64_t start; + uint64_t len; + uint64_t minlen; +}; + /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { int nr_files; /* read only */ @@ -316,6 +322,7 @@ struct inodes_stat_t { #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ #define FITHAW _IOWR('X', 120, int) /* Thaw */ +#define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */ #define FS_IOC_GETFLAGS _IOR('f', 1, long) #define FS_IOC_SETFLAGS _IOW('f', 2, long) @@ -1581,6 +1588,7 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); + int (*trim_fs) (struct super_block *, struct fstrim_range *); }; /* -- cgit v1.2.2 From 5b41d92437f1ae19b3f3ffa3b16589fd5df50ac0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:13 -0400 Subject: ext4: implement writeback livelock avoidance using page tagging This is analogous to Jan Kara's commit, f446daaea9d4a420d16c606f755f3689dcb2d0ce mm: implement writeback livelock avoidance using page tagging but since we forked write_cache_pages, we need to reimplement it there (and in ext4_da_writepages, since range_cyclic handling was moved to there) If you start a large buffered IO to a file, and then set fsync after it, you'll find that fsync does not complete until the other IO stops. If you continue re-dirtying the file (say, putting dd with conv=notrunc in a loop), when fsync finally completes (after all IO is done), it reports via tracing that it has written many more pages than the file contains; in other words it has synced and re-synced pages in the file multiple times. This then leads to problems with our writeback_index update, since it advances it by pages written, and essentially sets writeback_index off the end of the file... With the following patch, we only sync as much as was dirty at the time of the sync. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- include/linux/writeback.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..3d132bfb4f3d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -143,6 +143,8 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, int generic_writepages(struct address_space *mapping, struct writeback_control *wbc); +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); -- cgit v1.2.2 From 7f93cff90fa9be6ed45f6189e136153d1d8631b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 21:30:13 -0400 Subject: ext4: fix kernel oops if the journal superblock has a non-zero j_errno Commit 84061e0 fixed an accounting bug only to introduce the possibility of a kernel OOPS if the journal has a non-zero j_errno field indicating that the file system had detected a fs inconsistency. After the journal replay, if the journal superblock indicates that the file system has an error, this indication is transfered to the file system and then ext4_commit_super() is called to write this to the disk. But since the percpu counters are now initialized after the journal replay, the call to ext4_commit_super() will cause a kernel oops since it needs to use the percpu counters the ext4 superblock structure. The fix is to skip setting the ext4 free block and free inode fields if the percpu counter has not been set. Thanks to Ken Sumrall for reporting and analyzing the root causes of this bug. Addresses-Google-Bug: #3054080 Signed-off-by: "Theodore Ts'o" --- include/linux/percpu_counter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8a7d510ffa9c..46f6ba56fa91 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -78,6 +78,11 @@ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) return 1; } +static inline int percpu_counter_initialized(struct percpu_counter *fbc) +{ + return (fbc->counters != NULL); +} + #else struct percpu_counter { @@ -143,6 +148,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc) return percpu_counter_read(fbc); } +static inline int percpu_counter_initialized(struct percpu_counter *fbc) +{ + return 1; +} + #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct percpu_counter *fbc) -- cgit v1.2.2