From c5c7b8ddfbf8cb3b2291e515a34ab1b8982f5a2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jun 2014 23:46:28 -0400 Subject: ext4: Fix buffer double free in ext4_alloc_branch() Error recovery in ext4_alloc_branch() calls ext4_forget() even for buffer corresponding to indirect block it did not allocate. This leads to brelse() being called twice for that buffer (once from ext4_forget() and once from cleanup in ext4_ind_map_blocks()) leading to buffer use count misaccounting. Eventually (but often much later because there are other users of the buffer) we will see messages like: VFS: brelse: Trying to free free buffer Another manifestation of this problem is an error: JBD2 unexpected failure: jbd2_journal_revoke: !buffer_revoked(bh); inconsistent data on disk The fix is easy - don't forget buffer we did not allocate. Also add an explanatory comment because the indexing at ext4_alloc_branch() is somewhat subtle. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/indirect.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 8a57e9fcd1b9..f85bafd474dc 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return 0; failed: for (; i >= 0; i--) { - if (i != indirect_blks && branch[i].bh) + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], -- cgit v1.2.2 From ccfb30001f37ace4690a74c27b4812cf054e123a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jun 2014 13:02:11 +0900 Subject: f2fs: fix to report newly allocate region as extent Previous get_block in f2fs didn't report the newly allocated region which has NEW_ADDR. For reader, it should not report, but fiemap needs this. So, this patch introduces two get_block sharing core function. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0924521306b4..f8cf619edb5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -608,8 +608,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) * b. do not use extent cache for better performance * c. 
give the block addresses to blockdev */ -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; @@ -637,7 +637,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; if (dn.data_blkaddr != NULL_ADDR) { @@ -671,7 +671,7 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -708,10 +708,23 @@ out: return err; } +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, get_data_block); + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); } static int f2fs_read_data_page(struct file *file, struct page *page) -- cgit v1.2.2 From ead432756ab2c76b1f1de742a1c8a06992cb98eb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jun 2014 13:05:55 +0900 Subject: f2fs: recover fallocated data and its i_size together This patch arranges the f2fs_locks to cover the fallocated data and its i_size. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c58e33075719..f6c4bdaaae86 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -659,13 +659,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); - f2fs_unlock_op(sbi); if (ret) break; @@ -683,8 +683,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); - f2fs_write_inode(inode, NULL); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } -- cgit v1.2.2 From 98397ff3cddcfdedd2ba1701bd30a73c1d733769 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jun 2014 13:07:31 +0900 Subject: f2fs: fix not to allocate unnecessary blocks during fallocate This patch fixes the fallocate bug like below. (See xfstests/255) In fallocate(fd, 0, 20480), expand_inode_data processes for (index = pg_start; index <= pg_end; index++) { f2fs_reserve_block(); ... } So, even though fallocate requests 20480, 5 blocks, f2fs allocates 6 blocks including pg_end. So, this patch adds one condition to avoid block allocation. 
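To make the off-by-one concrete, here is a standalone sketch of the index arithmetic (illustrative only, not f2fs code; the 4KiB block size and the helper name are assumptions):

/*
 * For offset=0, len=20480 the inclusive loop visits indices 0..5,
 * i.e. 6 blocks, unless the final index is skipped when the range
 * ends exactly on a block boundary (off_end == 0), giving the
 * requested 5 blocks.
 */
#include <stdio.h>

#define BLKSIZE 4096UL

static unsigned long blocks_reserved(unsigned long offset, unsigned long len,
                                     int skip_aligned_end)
{
        unsigned long pg_start = offset / BLKSIZE;
        unsigned long pg_end = (offset + len) / BLKSIZE;
        unsigned long off_end = (offset + len) % BLKSIZE;
        unsigned long index, n = 0;

        for (index = pg_start; index <= pg_end; index++) {
                if (skip_aligned_end && index == pg_end && !off_end)
                        continue;  /* the added condition: nothing to allocate */
                n++;
        }
        return n;
}

int main(void)
{
        printf("old: %lu blocks, new: %lu blocks\n",
               blocks_reserved(0, 20480, 0), blocks_reserved(0, 20480, 1));
        return 0;
}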
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f6c4bdaaae86..7d8b96275092 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -664,11 +664,14 @@ static int expand_inode_data(struct inode *inode, loff_t offset, for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; + if (index == pg_end && !off_end) + goto noalloc; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); if (ret) break; - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) -- cgit v1.2.2 From ec7756ae1517af483d995f386936d00a4cb1ab7d Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Wed, 25 Jun 2014 22:08:29 -0400 Subject: fs/mbcache: replace __builtin_log2() with ilog2() Fix compiler error with some gcc version(s) that do not support __builtin_log2() by replacing __builtin_log2() with ilog2(). Signed-off-by: T. Makphaibulchoke Signed-off-by: Theodore Ts'o Reviewed-by: Maciej W. Rozycki --- fs/mbcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/mbcache.c b/fs/mbcache.c index bf166e388f0d..187477ded6b3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -73,6 +73,7 @@ #include #include #include +#include #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -93,7 +94,7 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) -#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) -- cgit v1.2.2 From e43bb4e612b402a631bc549ac496f78bc7a79438 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 26 Jun 2014 10:11:53 -0400 Subject: ext4: decrement free clusters/inodes counters when block group declared bad We should decrement free clusters counter when block bitmap is marked as corrupt and free inodes counter when the allocation bitmap is marked as corrupt to avoid misunderstanding due to incorrect available size in statfs result. User can get immediately ENOSPC error from write begin without reaching for the writepages. Cc: Darrick J. 
Wong Reported-by: Amit Sahrawat Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan --- fs/ext4/balloc.c | 16 ++++++++++++++++ fs/ext4/ialloc.c | 23 +++++++++++++++++++++++ fs/ext4/mballoc.c | 8 ++++++++ 3 files changed, 47 insertions(+) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0762d143e252..fca382037ddd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -194,7 +194,16 @@ static void ext4_init_block_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -359,6 +368,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return; @@ -369,6 +379,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -376,6 +389,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0ee59a6644e2..a87455df38bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, struct ext4_group_desc *gdp) { struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +196,12 @@ verify: ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); grp = 
ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } @@ -321,6 +338,12 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 59e31622cc6e..7f72f50a8fa7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -759,6 +760,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1431,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1441,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &e4b->bd_info->bb_state); -- cgit v1.2.2 From 77ea2a4ba657a1ad4fb7c64bc5cdce84b8a132b6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Jun 2014 12:28:57 -0400 Subject: ext4: Fix block zeroing when punching holes in indirect block files free_holes_block() passed local variable as a block pointer to ext4_clear_blocks(). Thus ext4_clear_blocks() zeroed out this local variable instead of proper place in inode / indirect block. We later zero out proper place in inode / indirect block but don't dirty the inode / buffer again which can lead to subtle issues (some changes e.g. to inode can be lost). 
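The buggy pattern is easy to reproduce outside the kernel. A standalone illustration (not ext4 code) of why zeroing through a pointer to a local copy leaves the real slot untouched:

#include <stdio.h>

static void clear_blocks(unsigned int *first, unsigned int *last)
{
        /* stands in for ext4_clear_blocks() zeroing the block pointers */
        while (first < last)
                *first++ = 0;
}

int main(void)
{
        unsigned int i_data[1] = { 1234 };      /* slot in the indirect block */
        unsigned int blk = i_data[0];           /* local copy */

        clear_blocks(&blk, &blk + 1);           /* buggy: zeroes only the copy */
        printf("after buggy call: i_data[0] = %u\n", i_data[0]); /* still 1234 */

        clear_blocks(&i_data[0], &i_data[0] + 1); /* fixed: zeroes the slot */
        printf("after fixed call: i_data[0] = %u\n", i_data[0]); /* now 0 */
        return 0;
}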
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index f85bafd474dc..6f3bb55567b6 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1335,8 +1335,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, if (level == 0 || (bh && all_zeroes((__le32 *)bh->b_data, (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); - *i_data = 0; + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); } brelse(bh); bh = NULL; -- cgit v1.2.2 From a93cd4cf86466caa49cfe64607bea7f0bde3f916 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Jun 2014 12:30:54 -0400 Subject: ext4: Fix hole punching for files with indirect blocks Hole punching code for files with indirect blocks wrongly computed the number of blocks which need to be cleared when traversing the indirect block tree. That could result in punching more blocks than actually requested and thus effectively causing data loss. For example: fallocate -n -p 10240000 4096 will punch the range 10240000 - 12632064 instead of the range 10240000 - 10244096. Fix the calculation. CC: stable@vger.kernel.org Fixes: 8bad6fc813a3a5300f51369c39d315679fd88c72 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 6f3bb55567b6..fd69da194826 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1316,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; + ext4_lblk_t count2; + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } - first2 = (first > offset) ? first - offset : 0; + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } ret = free_hole_blocks(handle, inode, bh, (__le32 *)bh->b_data, level - 1, - first2, count - offset, + first2, count2, inode->i_sb->s_blocksize >> 2); if (ret) { brelse(bh); -- cgit v1.2.2 From 76f47128f9b33af1e96819746550d789054c9664 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 19 Jun 2014 16:44:48 -0400 Subject: nfsd: fix rare symlink decoding bug An NFS operation that creates a new symlink includes the symlink data, which is xdr-encoded as a length followed by the data plus 0 to 3 bytes of zero-padding as required to reach a 4-byte boundary. The vfs, on the other hand, wants null-terminated data. The simple way to handle this would be by copying the data into a newly allocated buffer with space for the final null. The current nfsd_symlink code tries to be more clever by skipping that step in the (likely) case where the byte following the string is already 0. But that assumes that the byte following the string is ours to look at. In fact, it might be the first byte of a page that we can't read, or of some object that another task might modify. Worse, the NFSv4 code tries to fix the problem by actually writing to that byte. In the NFSv2/v3 cases this actually appears to be safe: - nfs3svc_decode_symlinkargs explicitly null-terminates the data (after first checking its length and copying it to a new page). - NFSv2 limits symlinks to 1k. 
The buffer holding the rpc request is always at least a page, and the link data (and previous fields) have maximum lengths that prevent the request from reaching the end of a page. In the NFSv4 case the CREATE op is potentially just one part of a long compound so can end up on the end of a page if you're unlucky. The minimal fix here is to copy and null-terminate in the NFSv4 case. The nfsd_symlink() interface here seems too fragile, though. It should really either do the copy itself every time or just require a null-terminated string. Reported-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 9 --------- fs/nfsd/nfs4xdr.c | 13 ++++++++++++- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6851b003f2a4..8f029db5d271 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -617,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. - */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, create->cr_linkname, create->cr_linklen, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 83baf2bfe9e9..5b4fef55676a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -600,7 +600,18 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create READ_BUF(4); create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + /* + * The VFS will want a null-terminated string, and + * null-terminating in place isn't safe since this might + * end on a page boundary: + */ + create->cr_linkname = + kmalloc(create->cr_linklen + 1, GFP_KERNEL); + if (!create->cr_linkname) + return nfserr_jukebox; + memcpy(create->cr_linkname, p, create->cr_linklen); + create->cr_linkname[create->cr_linklen] = '\0'; + defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: -- cgit v1.2.2 From 9b4eaf43f4b0207b5d1ca8b8d22df88ea9e142fe Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:35:59 +0800 Subject: btrfs: rename add_device_membership to btrfs_kobj_add_device Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index df39458f1487..1395efbffa2d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -605,7 +605,7 @@ static void init_feature_attrs(void) } } -static int add_device_membership(struct btrfs_fs_info *fs_info) +static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -666,7 +666,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = add_device_membership(fs_info); + error = btrfs_kobj_add_device(fs_info); if (error) goto failure; -- cgit v1.2.2 From 99994cde9c59c2b8bb67d46d531b26cc73e39747 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:00 +0800 Subject: btrfs: dev delete should remove sysfs entry when we 
delete the device from the mounted btrfs, we would need its corresponding sysfs entry to be removed as well. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 20 ++++++++++++++++++++ fs/btrfs/sysfs.h | 2 ++ fs/btrfs/volumes.c | 4 ++++ 3 files changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1395efbffa2d..401677b11045 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -605,6 +605,26 @@ static void init_feature_attrs(void) } } +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) { int error = 0; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9ab576318a84..529554e5f9de 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,4 +66,6 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c83b24251e53..be2c8e285afb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1680,6 +1681,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; -- cgit v1.2.2 From 0d39376aa28eba6d63d0120ccc399735842abc8e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:01 +0800 Subject: btrfs: dev add should add its sysfs entry we would need the device links to be created when a device is added. 
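The primitive involved is a plain sysfs symlink from the per-filesystem "devices" directory to the block device's kobject. A rough kernel-context sketch (hypothetical function name; the actual change is in the diff below):

/* creates /sys/fs/btrfs/<fsid>/devices/<disk> -> the block device */
static int example_add_device_link(struct kobject *device_dir_kobj,
                                   struct btrfs_device *dev)
{
        struct hd_struct *disk = dev->bdev->bd_part;
        struct kobject *disk_kobj = &part_to_dev(disk)->kobj;

        return sysfs_create_link(device_dir_kobj, disk_kobj, disk_kobj->name);
}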
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 12 +++++++++--- fs/btrfs/sysfs.h | 2 ++ fs/btrfs/volumes.c | 5 +++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 401677b11045..78699364f537 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -625,14 +625,17 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, return 0; } -static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *dev; - fs_info->device_dir_kobj = kobject_create_and_add("devices", + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) return -ENOMEM; @@ -643,6 +646,9 @@ static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) if (!dev->bdev) continue; + if (one_device && one_device != dev) + continue; + disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; @@ -686,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = btrfs_kobj_add_device(fs_info); + error = btrfs_kobj_add_device(fs_info, NULL); if (error) goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 529554e5f9de..ac46df37504c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,6 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index be2c8e285afb..19b67e969b52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2147,6 +2147,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { @@ -2209,6 +2213,7 @@ error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); -- cgit v1.2.2 From 49c6f736f34f901117c20960ebd7d5e60f12fcac Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:02 +0800 Subject: btrfs: dev replace should replace the sysfs entry when we replace the device its corresponding sysfs entry has to be replaced as well Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2af6e66fe788..eea26e1b2fda 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); @@ -562,6 
+563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); -- cgit v1.2.2 From b2373f255cacdc1ea4da25e75a5a78949ffd9d66 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:03 +0800 Subject: btrfs: create sprout should rename fsid on the sysfs as well Creating sprout will change the fsid of the mounted root. do the same on the sysfs as well. reproducer: mount /dev/sdb /btrfs (seed disk) btrfs dev add /dev/sdc /btrfs mount -o rw,remount /btrfs btrfs dev del /dev/sdb /btrfs mount /dev/sdb /btrfs Error: kobject_add_internal failed for fe350492-dc28-4051-a601-e017b17e6145 with -EEXIST, don't try to register things with the same name in the same directory. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19b67e969b52..6ca0d9cc46f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2154,6 +2154,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2164,6 +2165,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { -- cgit v1.2.2 From 5588383ece6127909df5b9d601d562fe5b9fe38a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 24 Jun 2014 15:39:16 +0800 Subject: Btrfs: fix crash when mounting raid5 btrfs with missing disks The reproducer is $ mkfs.btrfs D1 D2 D3 -mraid5 $ mkfs.ext4 D2 && mkfs.ext4 D3 $ mount D1 /btrfs -odegraded ------------------- [ 87.672992] ------------[ cut here ]------------ [ 87.673845] kernel BUG at fs/btrfs/raid56.c:1828! ... [ 87.673845] RIP: 0010:[] [] __raid_recover_end_io+0x4ae/0x4d0 ... [ 87.673845] Call Trace: [ 87.673845] [] ? mempool_free+0x36/0xa0 [ 87.673845] [] raid_recover_end_io+0x75/0xa0 [ 87.673845] [] bio_endio+0x5b/0xa0 [ 87.673845] [] bio_endio_nodec+0x12/0x20 [ 87.673845] [] end_workqueue_fn+0x41/0x50 [ 87.673845] [] normal_work_helper+0xca/0x2c0 [ 87.673845] [] process_one_work+0x1eb/0x530 [ 87.673845] [] ? process_one_work+0x189/0x530 [ 87.673845] [] worker_thread+0x11b/0x4f0 [ 87.673845] [] ? rescuer_thread+0x290/0x290 [ 87.673845] [] kthread+0xe4/0x100 [ 87.673845] [] ? kthread_create_on_node+0x220/0x220 [ 87.673845] [] ret_from_fork+0x7c/0xb0 [ 87.673845] [] ? kthread_create_on_node+0x220/0x220 ------------------- It's because that we miscalculate @rbio->bbio->error so that it doesn't reach maximum of tolerable errors while it should have. 
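The accounting rule the fix restores can be shown with a standalone sketch (hypothetical helpers, not the btrfs code): stripes already known to have failed must count as errors up front, even though no read is issued for them.

#include <stdio.h>

static int count_errors(int num_stripes, int faila, int failb,
                        int count_known_bad)
{
        int stripe, errors = 0;

        for (stripe = 0; stripe < num_stripes; stripe++) {
                if (stripe == faila || stripe == failb) {
                        if (count_known_bad)
                                errors++;       /* the fix */
                        continue;               /* no read for a missing stripe */
                }
                /* a read of this stripe would be submitted here */
        }
        return errors;
}

int main(void)
{
        /* raid5 over 3 devices tolerates 1 lost stripe; here 2 are gone:
         * the old accounting saw 0 errors and ran into the BUG_ON, the
         * fixed accounting sees 2 > 1 and fails recovery gracefully */
        printf("old: %d errors, new: %d errors (max 1)\n",
               count_errors(3, 1, 2, 0), count_errors(3, 1, 2, 1));
        return 0;
}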
Signed-off-by: Liu Bo Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/raid56.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4055291a523e..4a88f073fdd7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; -- cgit v1.2.2 From c39aa7056f50fa7b49265ea35c0f3eddb7c7be68 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 25 Jun 2014 00:00:44 +0900 Subject: btrfs compression: reuse recently used workspace Add compression `workspace' in free_workspace() to `idle_workspace' list head, instead of tail. So we have better chances to reuse most recently used `workspace'. Signed-off-by: Sergey Senozhatsky Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 92371c414228..1daea0b47187 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; -- cgit v1.2.2 From 46c4e71e9b02a649c722f06569f5b6575da02dba Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Jun 2014 17:48:28 +0100 Subject: Btrfs: assert send doesn't attempt to start transactions When starting a transaction just assert that current->journal_info doesn't contain a send transaction stub, since send isn't supposed to start transactions and when it finishes (either successfully or not) it's supposed to set current->journal_info to NULL. This is motivated by the change titled: Btrfs: fix crash when starting transaction Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 511839c04f11..a32ad65b6cdc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info && - current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; -- cgit v1.2.2 From 472b909ff6f4884d235ef7b9d3847fad5efafbff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 Jun 2014 13:45:41 -0700 Subject: btrfs: only unlock block in verify_parent_transid if we locked it This is a regression from my patch a26e8c9f75b0bfd8cccc9e8f110737b136eb5994, we need to only unlock the block if we were the one who locked it. 
Otherwise this will trip BUG_ON()'s in locking.c Thanks, cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8bb4aa19898f..f00165de6fbf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); - btrfs_tree_read_unlock_blocking(eb); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } -- cgit v1.2.2 From 4e26445faad366d67d7723622bf6a60a6f0f5993 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:50:28 +0800 Subject: kernfs: introduce kernfs_pin_sb() kernfs_pin_sb() tries to get a refcnt of the superblock. This will be used by cgroupfs. v2: - make kernfs_pin_sb() return the superblock. - drop kernfs_drop_sb(). tj: Updated the comment a bit. [ This is a prerequisite for a bugfix. ] Cc: # 3.15 Acked-by: Greg Kroah-Hartman Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- fs/kernfs/mount.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98a6cdd..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated to this kernfs_root, or + * -EINVAL if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", -- cgit v1.2.2 From ecca47ce8294843045e7465d76fee84dbf07a004 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Jul 2014 16:41:03 -0400 Subject: kernfs: kernfs_notify() must be useable from non-sleepable contexts d911d9874801 ("kernfs: make kernfs_notify() trigger inotify events too") added fsnotify triggering to kernfs_notify() which requires a sleepable context. There are already existing users of kernfs_notify() which invoke it from an atomic context and in general it's silly to require a sleepable context for triggering a notification. The following is an invalid context bug triggered by md invoking sysfs_notify() from IO completion path. 
BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1 2 locks held by swapper/1/0: #0: (&(&vblk->vq_lock)->rlock){-.-...}, at: [] virtblk_done+0x42/0xe0 [virtio_blk] #1: (&(&bitmap->counts.lock)->rlock){-.....}, at: [] bitmap_endwrite+0x68/0x240 irq event stamp: 33518 hardirqs last enabled at (33515): [] default_idle+0x1f/0x230 hardirqs last disabled at (33516): [] common_interrupt+0x6d/0x72 softirqs last enabled at (33518): [] _local_bh_enable+0x22/0x50 softirqs last disabled at (33517): [] irq_enter+0x60/0x80 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.16.0-0.rc2.git2.1.fc21.x86_64 #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 f90db13964f4ee05 ffff88007d403b80 ffffffff81807b4c 0000000000000000 ffff88007d403ba8 ffffffff810d4f14 0000000000000000 0000000000441800 ffff880078fa1780 ffff88007d403c38 ffffffff8180caf2 Call Trace: [] dump_stack+0x4d/0x66 [] __might_sleep+0x184/0x240 [] mutex_lock_nested+0x42/0x440 [] kernfs_notify+0x90/0x150 [] bitmap_endwrite+0xcc/0x240 [] close_write+0x93/0xb0 [raid1] [] r1_bio_write_done+0x29/0x50 [raid1] [] raid1_end_write_request+0xe4/0x260 [raid1] [] bio_endio+0x6b/0xa0 [] blk_update_request+0x94/0x420 [] blk_mq_end_io+0x1a/0x70 [] virtblk_request_done+0x32/0x80 [virtio_blk] [] __blk_mq_complete_request+0x88/0x120 [] blk_mq_complete_request+0x2a/0x30 [] virtblk_done+0x66/0xe0 [virtio_blk] [] vring_interrupt+0x3a/0xa0 [virtio_ring] [] handle_irq_event_percpu+0x77/0x340 [] handle_irq_event+0x3d/0x60 [] handle_edge_irq+0x66/0x130 [] handle_irq+0x84/0x150 [] do_IRQ+0x4d/0xe0 [] common_interrupt+0x72/0x72 [] ? native_safe_halt+0x6/0x10 [] default_idle+0x24/0x230 [] arch_cpu_idle+0xf/0x20 [] cpu_startup_entry+0x37c/0x7b0 [] start_secondary+0x25b/0x300 This patch fixes it by punting the notification delivery through a work item. This ends up adding an extra pointer to kernfs_elem_attr enlarging kernfs_node by a pointer, which is not ideal but not a very big deal either. If this turns out to be an actual issue, we can move kernfs_elem_attr->size to kernfs_node->iattr later. Signed-off-by: Tejun Heo Reported-by: Josh Boyer Cc: Jens Axboe Reviewed-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 69 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e3d37f607f97..d895b4b7b661 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -39,6 +39,19 @@ struct kernfs_open_node { struct list_head files; /* goes through kernfs_open_file.list */ }; +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. 
+ */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } -/** - * kernfs_notify - notify a kernfs file - * @kn: file to notify - * - * Notify @kn such that poll(2) on @kn wakes up. - */ -void kernfs_notify(struct kernfs_node *kn) +static void kernfs_notify_workfn(struct work_struct *work) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_node *kn; struct kernfs_open_node *on; struct kernfs_super_info *info; - unsigned long flags; - - if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); /* kick poll */ - spin_lock_irqsave(&kernfs_open_node_lock, flags); + spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) { @@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn) wake_up_interruptible(&on->poll); } - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + spin_unlock_irq(&kernfs_open_node_lock); /* kick fsnotify */ mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct inode *inode; struct dentry *dentry; @@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn) } mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); -- cgit v1.2.2 From 69bbd9c7b99974f3a701d4de6ef7010c37182a47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jun 2014 17:23:12 +0300 Subject: nfs: fix nfs4d readlink truncated packet XDR requires 4-byte alignment; nfs4d READLINK reply writes out the padding, but truncates the packet to the padding-less size. Fix by taking the padding into consideration when truncating the packet. Symptoms: # ll /mnt/ ls: cannot read symbolic link /mnt/test: Input/output error total 4 -rw-r--r--. 1 root root 0 Jun 14 01:21 123456 lrwxrwxrwx. 1 root root 6 Jul 2 03:33 test drwxr-xr-x. 1 root root 0 Jul 2 23:50 tmp drwxr-xr-x. 1 root root 60 Jul 2 23:44 tree Signed-off-by: Avi Kivity Fixes: 476a7b1f4b2c (nfsd4: don't treat readlink like a zero-copy operation) Reviewed-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5b4fef55676a..2fc7abebeb9b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3278,7 +3278,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd wire_count = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); - xdr_truncate_encode(xdr, length_offset + 4 + maxcount); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); if (maxcount & 3) write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, 4 - (maxcount&3)); -- cgit v1.2.2 From 3cc79392558f1789e5e1d2fce44b681980f403c3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Jun 2014 22:36:02 +0100 Subject: Btrfs: atomically set inode->i_flags in btrfs_update_iflags This change is based on the corresponding recent change for ext4: ext4: atomically set inode->i_flags in ext4_set_inode_flags() That has the following commit message that applies to btrfs as well: "Use cmpxchg() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the EXT4_IMMUTABLE_FL, EXT4_APPEND_FL flags, since this opens up a race where an immutable file has the immutable flag cleared for a brief window of time." Replacing EXT4_IMMUTABLE_FL and EXT4_APPEND_FL with BTRFS_INODE_IMMUTABLE and BTRFS_INODE_APPEND, respectively. Reviewed-by: David Sterba Reviewed-by: Satoru Takeuchi Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6ea15469c63f..02dc64bbf1cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* -- cgit v1.2.2 From 5f3164813b90f7dbcb5c3ab9006906222ce471b7 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 26 Jun 2014 11:08:16 +0800 Subject: Btrfs: fix race between balance recovery and root deletion Balance recovery is called when RW mounting or remounting from RO to RW, it is called to finish roots merging. When doing balance recovery, relocation root's corresponding fs root(whose root refs is 0) might be destroyed by cleaner thread, this will make btrfs fail to mount. Fix this problem by holding @cleaner_mutex when doing balance recovery. 
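In outline, the serialization looks like this (kernel-style sketch with a hypothetical cleaner helper, not the literal code; the real change is in the diff below):

/* cleaner thread: may drop fs roots whose root refs reached 0 */
static void cleaner_pass(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->cleaner_mutex);
        clean_deleted_roots(fs_info);   /* hypothetical: frees zero-ref roots */
        mutex_unlock(&fs_info->cleaner_mutex);
}

/* mount / RO->RW remount: balance recovery under the same mutex, so the
 * roots it is still merging cannot vanish underneath it */
static int recover_on_mount(struct btrfs_fs_info *fs_info,
                            struct btrfs_root *tree_root)
{
        int ret;

        mutex_lock(&fs_info->cleaner_mutex);
        ret = btrfs_recover_relocation(tree_root);
        mutex_unlock(&fs_info->cleaner_mutex);
        return ret;
}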
Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/super.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f00165de6fbf..08e65e9cf2aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2905,7 +2905,9 @@ retry_root_backup: if (ret) goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4662d92a4b73..b6ebde231de7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1467,7 +1467,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; -- cgit v1.2.2 From 2aa06a35d06a34b3109bdbf1d653de1695dc8f12 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jun 2014 16:50:31 -0500 Subject: btrfs: fix nossd and ssd_spread mount option regression The commit 0780253 btrfs: Cleanup the btrfs_parse_options for remount. broke ssd options quite badly; it stopped making ssd_spread imply ssd, and it made "nossd" unsettable. Put things back at least as well as they were before (though ssd mount option handling is still pretty odd: # mount -o "nossd,ssd_spread" works?) Reported-by: Roman Mamedov Signed-off-by: Eric Sandeen Signed-off-by: Chris Mason --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b6ebde231de7..0927e463afca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_ssd_spread: btrfs_set_and_info(root, SSD_SPREAD, "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_clear_and_info(root, NOSSD, + btrfs_set_and_info(root, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; -- cgit v1.2.2 From e755f780865221252ef3321215c9796b78e7b1c5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 30 Jun 2014 17:12:47 +0800 Subject: btrfs: fix null pointer dereference in clone_fs_devices when name is null when one of the device path is missing btrfs_device name is null. So this patch will check for that. stack: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] strlen+0x0/0x30 [] ? clone_fs_devices+0xaa/0x160 [btrfs] [] btrfs_init_new_device+0x317/0xca0 [btrfs] [] ? __kmalloc_track_caller+0x15a/0x1a0 [] btrfs_ioctl+0xaa3/0x2860 [btrfs] [] ? handle_mm_fault+0x48c/0x9c0 [] ? __blkdev_put+0x171/0x180 [] ? __do_page_fault+0x4ac/0x590 [] ? blkdev_put+0x106/0x110 [] ? mntput+0x35/0x40 [] do_vfs_ioctl+0x460/0x4a0 [] ? ____fput+0xe/0x10 [] ? task_work_run+0xb3/0xd0 [] SyS_ioctl+0x57/0x90 [] ? 
do_page_fault+0xe/0x10 [] system_call_fastpath+0x16/0x1b reproducer: mkfs.btrfs -draid1 -mraid1 /dev/sdg1 /dev/sdg2 btrfstune -S 1 /dev/sdg1 modprobe -r btrfs && modprobe btrfs mount -o degraded /dev/sdg1 /btrfs btrfs dev add /dev/sdg3 /btrfs Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6ca0d9cc46f9..6104676857f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -555,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; -- cgit v1.2.2 From 0aeb8a6e67cddeac1d42cf64795fde0641a1cffb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 30 Jun 2014 17:12:48 +0800 Subject: btrfs: fix null pointer dereference in btrfs_show_devname when name is null dev->name is null but missing flag is not set. Strictly speaking the missing flag should have been set, but there are more places where code just checks if name is null. For now this patch does the same. stack: BUG: unable to handle kernel NULL pointer dereference at 0000000000000064 IP: [] btrfs_show_devname+0x58/0xf0 [btrfs] [] show_vfsmnt+0x39/0x130 [] m_show+0x16/0x20 [] seq_read+0x296/0x390 [] vfs_read+0x9d/0x160 [] SyS_read+0x49/0x90 [] system_call_fastpath+0x16/0x1b reproducer: mkfs.btrfs -draid1 -mraid1 /dev/sdg1 /dev/sdg2 btrfstune -S 1 /dev/sdg1 modprobe -r btrfs && modprobe btrfs mount -o degraded /dev/sdg1 /btrfs btrfs dev add /dev/sdg3 /btrfs Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0927e463afca..8e16bca69c56 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1811,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } -- cgit v1.2.2 From 14f5979633a67de81b9bd4a36a0eb99125728f9b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 29 Jun 2014 21:45:40 +0100 Subject: Btrfs: fix use-after-free when cloning a trailing file hole The transaction handle was being used after being freed. 
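The general shape of the bug is the classic use-after-free ordering problem; a standalone illustration (not btrfs code):

#include <stdio.h>
#include <stdlib.h>

struct handle { int id; };

static struct handle *start(void) { return calloc(1, sizeof(struct handle)); }
static void end(struct handle *h)  { free(h); }  /* handle is gone after this */
static void use(struct handle *h)  { printf("using handle %d\n", h->id); }

int main(void)
{
        struct handle *h = start();

        /* buggy ordering: end(h); use(h);  <- reads freed memory */
        use(h);         /* fixed ordering: every use happens first ... */
        end(h);         /* ... and only then is the handle released  */
        return 0;
}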
Cc: Chris Mason Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02dc64bbf1cb..2a99f4987bb1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3142,7 +3142,6 @@ out: static void clone_update_extent_map(struct inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, const u64 hole_offset, const u64 hole_len) { @@ -3157,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode, return; } - if (fi) { + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); btrfs_extent_item_to_extent_map(inode, path, fi, false, em); em->generation = -1; if (btrfs_file_extent_type(path->nodes[0], fi) == @@ -3511,18 +3514,15 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); } /* If we have an implicit hole (NO_HOLES feature). */ if (drop_start < new_key.offset) clone_update_extent_map(inode, trans, - path, NULL, drop_start, + NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, - extent, 0, 0); + clone_update_extent_map(inode, trans, path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3565,12 +3565,10 @@ process_slot: btrfs_end_transaction(trans, root); goto out; } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen); - if (ret) - goto out; - clone_update_extent_map(inode, trans, path, NULL, last_dest_end, - destoff + len - last_dest_end); } out: -- cgit v1.2.2 From 0a4eaea892a479aeebccde65986b27cfb6e33a78 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 20 Jun 2014 11:31:44 +0200 Subject: btrfs: remove stale comment from btrfs_flush_all_pending_stuffs Commit fcebe4562dec83b3f8d3088d77584727b09130b2 (Btrfs: rework qgroup accounting) removed the qgroup accounting after delayed refs. Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a32ad65b6cdc..98e76a333596 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1617,11 +1617,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ if (ret) return ret; -- cgit v1.2.2 From 130d5b415a091e493ac1508b9d27bbb85ba7b8c0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 20 Jun 2014 11:43:20 +0200 Subject: btrfs: use E2BIG instead of EIO if compression does not help Return codes got updated in 60e1975acb48fc3d74a3422b21dde74c977ac3d5 (btrfs: return errno instead of -1 from compression) lzo wrapper returns E2BIG in this case, do the same for zlib. 
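The convention being adopted, in a minimal standalone sketch (thresholds taken from the diff below; not the btrfs code): -E2BIG means "output would not be smaller, store the data uncompressed", while -EIO stays reserved for real failures.

#include <errno.h>
#include <stdio.h>

static int check_compression(unsigned long total_in, unsigned long total_out)
{
        if (total_in > 8192 && total_in < total_out)
                return -E2BIG;  /* data grew: not an I/O error */
        return 0;
}

int main(void)
{
        printf("grew: %d, shrank: %d\n",
               check_compression(16384, 20000), check_compression(16384, 9000));
        return 0;
}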
Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 4f196314c0c1..b67d8fc81277 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -EIO; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this -- cgit v1.2.2 From d288db5dc0110c8e0732d099aaf7a05e2ea0e0c8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 2 Jul 2014 16:58:01 +0800 Subject: Btrfs: fix race of using total_bytes_pinned This percpu counter @total_bytes_pinned is introduced to skip unnecessary operations of 'commit transaction'; it accounts for the space we may free but are stuck in delayed refs. And we zero out @space_info->total_bytes_pinned every transaction period so we have a better idea of how much space we'll actually free up by committing this transaction. However, we do the 'zero out' part a little earlier, before we actually unpin space, so we end up returning ENOSPC when we actually have free space that's just unpinned from committing transaction. xfstests/generic/074 complained then. This fixes it by actually accounting the percpu pinned number when 'unpin', and since it's protected by space_info->lock, the race is gone now. Signed-off-by: Liu Bo Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99c253918208..813537f362f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5678,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; down_write(&fs_info->commit_root_sem); @@ -5701,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->commit_root_sem); - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); - update_global_block_rsv(fs_info); } @@ -5741,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; -- cgit v1.2.2 From be2c765dff9e5584965f78853c2addd2bb926946 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Jul 2014 10:20:48 -0700 Subject: Btrfs: fix btrfs_print_leaf for skinny metadata We wouldn't actually print the extent information if we had a skinny metadata item; this fixes that. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/print-tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6efd70d3b64f..9626b4ad3b9a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); -- cgit v1.2.2 From abdd2e80a57e5f7278f47913315065f0a3d78d20 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Jun 2014 17:46:58 +0100 Subject: Btrfs: fix crash when starting transaction Often when starting a transaction we commit the currently running transaction, which can end up writing block group caches when the current process has its journal_info set to NULL (and not to a transaction). This makes our assertion at btrfs_check_data_free_space() (current_journal != NULL) fail, resulting in a crash/hang. Therefore fix it by setting journal_info. Two different traces of this issue follow below. 1) [51502.241936] BTRFS: assertion failed: current->journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [51502.242213] ------------[ cut here ]------------ [51502.242493] kernel BUG at fs/btrfs/ctree.h:3964! [51502.242669] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [51502.244010] Call Trace: [51502.244010] [] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [51502.244010] [] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [51502.244010] [] commit_cowonly_roots+0x164/0x226 [btrfs] [51502.244010] [] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [51502.244010] [] ? _raw_spin_unlock+0x2b/0x40 [51502.244010] [] start_transaction+0x459/0x620 [btrfs] [51502.244010] [] btrfs_start_transaction+0x1b/0x20 [btrfs] [51502.244010] [] __unlink_start_trans+0x31/0xe0 [btrfs] [51502.244010] [] btrfs_unlink+0x37/0xc0 [btrfs] [51502.244010] [] ? do_unlinkat+0x114/0x2a0 [51502.244010] [] vfs_unlink+0xcc/0x150 [51502.244010] [] do_unlinkat+0x260/0x2a0 [51502.244010] [] ? filp_close+0x64/0x90 [51502.244010] [] ? trace_hardirqs_on_caller+0x16/0x1e0 [51502.244010] [] ? 
trace_hardirqs_on_thunk+0x3a/0x3f [51502.244010] [] SyS_unlinkat+0x1b/0x40 [51502.244010] [] system_call_fastpath+0x16/0x1b [51502.244010] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 71 13 36 a0 48 89 fe 31 c0 48 c7 c7 b8 43 36 a0 48 89 e5 e8 5d b0 32 e1 <0f> 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [51502.244010] RIP [] assfail.constprop.88+0x1e/0x20 [btrfs] 2) [25405.097230] BTRFS: assertion failed: current->journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [25405.097488] ------------[ cut here ]------------ [25405.097767] kernel BUG at fs/btrfs/ctree.h:3964! [25405.097940] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [25405.100008] Call Trace: [25405.100008] [] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [25405.100008] [] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [25405.100008] [] commit_cowonly_roots+0x164/0x226 [btrfs] [25405.100008] [] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [25405.100008] [] ? bit_waitqueue+0xc0/0xc0 [25405.100008] [] start_transaction+0x459/0x620 [btrfs] [25405.100008] [] btrfs_start_transaction+0x1b/0x20 [btrfs] [25405.100008] [] btrfs_create+0x47/0x210 [btrfs] [25405.100008] [] ? btrfs_permission+0x3c/0x80 [btrfs] [25405.100008] [] vfs_create+0x9b/0x130 [25405.100008] [] do_last+0x849/0xe20 [25405.100008] [] ? link_path_walk+0x79/0x820 [25405.100008] [] path_openat+0xc5/0x690 [25405.100008] [] ? trace_hardirqs_on+0xd/0x10 [25405.100008] [] ? __alloc_fd+0x32/0x1d0 [25405.100008] [] do_filp_open+0x43/0xa0 [25405.100008] [] ? __alloc_fd+0x151/0x1d0 [25405.100008] [] do_sys_open+0x13c/0x230 [25405.100008] [] ? trace_hardirqs_on_caller+0x16/0x1e0 [25405.100008] [] SyS_open+0x22/0x30 [25405.100008] [] system_call_fastpath+0x16/0x1b [25405.100008] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 51 13 36 a0 48 89 fe 31 c0 48 c7 c7 d0 43 36 a0 48 89 e5 e8 6d b5 32 e1 <0f> 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [25405.100008] RIP [] assfail.constprop.88+0x1e/0x20 [btrfs] Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 98e76a333596..5f379affdf23 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -493,6 +493,7 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } -- cgit v1.2.2 From 571ff4731bc9e5811080f7d16d24c3af7cbca142 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: autofs4: fix false positive compile error On strict build environments we can see: fs/autofs4/inode.c: In function 'autofs4_fill_super': fs/autofs4/inode.c:312: error: 'pgrp' may be used uninitialized in this function make[2]: *** [fs/autofs4/inode.o] Error 1 make[1]: *** [fs/autofs4] Error 2 make: *** [fs] Error 2 make: *** Waiting for unfinished jobs.... This is due to pgrp_set being used to indicate that pgrp has been set, rather than pgrp itself being initialized.
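As an aside, the value-plus-flag pattern that trips the analyzer can be shown in a few lines of userspace C; the option parsing below is hypothetical and only models the autofs4_fill_super() flow, it is not the autofs code:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static void fill_super_model(const char *opts)
{
	int pgrp = 0;		/* the fix: give the checker a defined value */
	bool pgrp_set = false;

	if (sscanf(opts, "pgrp=%d", &pgrp) == 1)
		pgrp_set = true;

	/* The code is correct either way: pgrp is only read when
	 * pgrp_set is true, or after this fallback assignment.  A
	 * flow-insensitive checker cannot prove that, hence the
	 * false positive. */
	if (!pgrp_set)
		pgrp = getpgrp();

	printf("using pgrp %d\n", pgrp);
}

int main(void)
{
	fill_super_model("pgrp=42");
	fill_super_model("nooption");
	return 0;
}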
Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d7bd395ab586..1c55388ae633 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int pgrp; + int pgrp = 0; bool pgrp_set = false; int ret = -EINVAL; -- cgit v1.2.2 From f74373a5cc7a0155d232c4e999648c7a95435bb2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: /proc/stat: convert to single_open_size() These two patches are supposed to "fix" failed order-4 memory allocations which have been observed when reading /proc/stat. The problem has been observed on s390 as well as on x86. To address the problem change the seq_file memory allocations to fall back to vmalloc, so that allocations also work if memory is fragmented. This approach seems to be simpler and less intrusive than changing /proc/stat to use an iterator. It also "fixes" other users which use seq_file's single_open() interface. This patch (of 2): Use seq_file's single_open_size() to preallocate a buffer that is large enough to hold the whole output, instead of open coding it. Also calculate the requested size using the number of online cpus instead of possible cpus, since the size of the output only depends on the number of online cpus. Signed-off-by: Heiko Carstens Acked-by: David Rientjes Cc: Ian Kent Cc: Hendrik Brueckner Cc: Thorsten Diehl Cc: Andrea Righi Cc: Christoph Hellwig Cc: Al Viro Cc: Stefan Bader Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9e5f0e..bf2d03f8fd3e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { -- cgit v1.2.2 From 058504edd02667eef8fac9be27ab3ea74332e9b4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: fs/seq_file: fallback to vmalloc allocation There are a couple of seq_files which use the single_open() interface. This interface requires that the whole output must fit into a single buffer. E.g. for /proc/stat allocation failures have been observed because an order-4 memory allocation failed due to memory fragmentation. In such situations reading /proc/stat is not possible anymore.
Therefore change the seq_file code to fallback to vmalloc allocations which will usually result in a couple of order-0 allocations and hence also work if memory is fragmented. For reference a call trace where reading from /proc/stat failed: sadc: page allocation failure: order:4, mode:0x1040d0 CPU: 1 PID: 192063 Comm: sadc Not tainted 3.10.0-123.el7.s390x #1 [...] Call Trace: show_stack+0x6c/0xe8 warn_alloc_failed+0xd6/0x138 __alloc_pages_nodemask+0x9da/0xb68 __get_free_pages+0x2e/0x58 kmalloc_order_trace+0x44/0xc0 stat_open+0x5a/0xd8 proc_reg_open+0x8a/0x140 do_dentry_open+0x1bc/0x2c8 finish_open+0x46/0x60 do_last+0x382/0x10d0 path_openat+0xc8/0x4f8 do_filp_open+0x46/0xa8 do_sys_open+0x114/0x1f0 sysc_tracego+0x14/0x1a Signed-off-by: Heiko Carstens Tested-by: David Rientjes Cc: Ian Kent Cc: Hendrik Brueckner Cc: Thorsten Diehl Cc: Andrea Righi Cc: Christoph Hellwig Cc: Al Viro Cc: Stefan Bader Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb108d2..3857b720cb1b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? 
-ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; -- cgit v1.2.2 From 61c219f5814277ecb71d64cb30297028d6665979 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 5 Jul 2014 16:28:35 -0400 Subject: ext4: fix unjournalled bg descriptor while initializing inode bitmap The first time that we allocate from an uninitialized inode allocation bitmap, if the block allocation bitmap is also uninitialized, we need to get write access to the block group descriptor before we start modifying the block group descriptor flags and updating the free block count, etc. Otherwise, there is the potential of a bad journal checksum (if journal checksums are enabled), and of the file system becoming inconsistent if we crash at exactly the wrong time. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a87455df38bc..0840bf321cdb 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -874,6 +874,13 @@ got: goto out; } + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -910,13 +917,6 @@ got: } } - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) { - ext4_std_error(sb, err); - goto out; - } - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; -- cgit v1.2.2 From ae0f78de2c43b6fadd007c231a352b13b5be8ed2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 5 Jul 2014 18:40:52 -0400 Subject: ext4: clarify error count warning messages Make it clear that the values printed are times, and that the error count is counted since the last fsck. Also add a note about the required fsck version.
Signed-off-by: Pavel Machek Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9b9aabfb4d2..342394772b6c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2809,10 +2809,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2826,7 +2827,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, -- cgit v1.2.2 From 94d4c066a4ff170a2671b1a9b153febbf36796f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 5 Jul 2014 19:15:50 -0400 Subject: ext4: clarify ext4_error message in ext4_mb_generate_buddy_error() We are spending a lot of time explaining to users what this error means. Let's try to improve the message to avoid this problem. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7f72f50a8fa7..2dcb936be90e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -752,8 +752,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd; " - "block bitmap corrupt.", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor -- cgit v1.2.2 From 5dd214248f94d430d70e9230bda72f2654ac88a8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 5 Jul 2014 19:18:22 -0400 Subject: ext4: disable synchronous transaction batching if max_batch_time==0 The mount manpage says of the max_batch_time option: "This optimization can be turned off entirely by setting max_batch_time to 0." But the code doesn't do that. So fix the code to do that.
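A small userspace model of the resulting condition may help; struct journal_model and should_batch() below are hypothetical stand-ins for the jbd2 fields touched by the diff that follows, not the kernel code itself:

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

struct journal_model {
	pid_t last_sync_writer;
	unsigned int max_batch_time;	/* microseconds; 0 disables batching */
};

/* Only sleep waiting for joiners when a *different* process is doing
 * sync writes and batching has not been disabled by the admin. */
static bool should_batch(const struct journal_model *j, bool handle_sync, pid_t pid)
{
	return handle_sync && j->last_sync_writer != pid && j->max_batch_time != 0;
}

int main(void)
{
	struct journal_model j = { .last_sync_writer = 1, .max_batch_time = 0 };

	printf("batch? %d\n", should_batch(&j, true, 2));	/* 0: disabled */
	j.max_batch_time = 15000;
	printf("batch? %d\n", should_batch(&j, true, 2));	/* 1: batch */
	return 0;
}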
Signed-off-by: Eric Sandeen Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 2 -- fs/jbd2/transaction.c | 5 ++++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 342394772b6c..6297c07c937f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1525,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { sbi->s_min_batch_time = arg; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 38cfcf5f6fce..6f0f590cc5a3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1588,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle) * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; -- cgit v1.2.2 From 126b9d4365b110c157bc4cbc32540dfa66c9c85a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Jul 2014 15:28:50 +0200 Subject: fuse: timeout comparison fix As suggested by checkpatch.pl, use time_before64() instead of direct comparison of jiffies64 values. Signed-off-by: Miklos Szeredi Cc: --- fs/fuse/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 42198359fa1b..225176203a8c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (fuse_dentry_time(entry) < get_jiffies_64()) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64())) { int err; struct fuse_entry_out outarg; struct fuse_req *req; @@ -985,7 +985,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, int err; bool r; - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { r = true; err = fuse_do_getattr(inode, stat, file); } else { @@ -1171,7 +1171,7 @@ static int fuse_permission(struct inode *inode, int mask) ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); -- cgit v1.2.2 From 154210ccb3a871e631bf39fdeb7a8731d98af87b Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 26 Jun 2014 20:21:57 -0400 Subject: fuse: ignore entry-timeout on LOOKUP_REVAL The following test case demonstrates the bug: sh# mount -t glusterfs localhost:meta-test /mnt/one sh# mount -t glusterfs localhost:meta-test /mnt/two sh# echo stuff > /mnt/one/file; rm -f /mnt/two/file; echo stuff > /mnt/one/file bash: /mnt/one/file: Stale file handle sh# echo stuff > /mnt/one/file; rm -f /mnt/two/file; sleep 1; echo stuff > /mnt/one/file On the second open() on /mnt/one, FUSE would have used the old nodeid (file handle) trying to re-open it. 
Gluster is returning -ESTALE. The ESTALE propagates back to namei.c:filename_lookup() where lookup is re-attempted with LOOKUP_REVAL. The right behavior now would be for FUSE to ignore the entry-timeout and do the up-call revalidation. Instead, FUSE ignores LOOKUP_REVAL and succeeds the revalidation (because the entry-timeout has not passed), so open() is again retried on the old file handle and the ESTALE finally goes back to the application. Fix: if revalidation is happening with LOOKUP_REVAL, then ignore the entry-timeout and always do the up-call. Signed-off-by: Anand Avati Reviewed-by: Niels de Vos Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 225176203a8c..202a9721be93 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (time_before64(fuse_dentry_time(entry), get_jiffies_64())) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { int err; struct fuse_entry_out outarg; struct fuse_req *req; -- cgit v1.2.2 From 7b3d8bf7718a8de8ac6a25ed7332c1c129839400 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Thu, 26 Jun 2014 23:06:52 +0530 Subject: fuse: inode: drop cast This patch removes the cast on data of type void * as it is not needed. The following Coccinelle semantic patch was used for making the change: @r@ expression x; void* e; type T; identifier f; @@ ( *((T *)e) | ((T *)x)[...] | ((T *)x)->f | - (T *) e ) Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 754dcf23de8a..7b8fc7d33def 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1006,7 +1006,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); - if (!parse_fuse_opt((char *) data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev)) goto err; if (is_bdev) { -- cgit v1.2.2 From 233a01fa9c4c7c41238537e8db8434667ff28a2f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Jul 2014 15:28:51 +0200 Subject: fuse: handle large user and group ID If the number in "user_id=N" or "group_id=N" mount options was larger than INT_MAX then fuse returned EINVAL. Fix this to handle all valid uid/gid values.
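The shape of the fix can be modeled in userspace with strtoul(); parse_uint() below is a hypothetical analogue of the new fuse_match_uint() helper shown in the diff that follows, not fuse code:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse a base-10 unsigned int; unlike a signed parse, values in
 * (INT_MAX, UINT_MAX] are accepted, which matters for large uids. */
static int parse_uint(const char *s, unsigned int *res)
{
	char *end;
	unsigned long v;

	errno = 0;
	v = strtoul(s, &end, 10);
	if (errno || end == s || *end != '\0' || v > UINT_MAX)
		return -EINVAL;
	*res = (unsigned int)v;
	return 0;
}

int main(void)
{
	unsigned int uid;

	/* 3000000000 > INT_MAX but is a perfectly valid uid_t value */
	if (parse_uint("3000000000", &uid) == 0)
		printf("uid %u accepted\n", uid);
	return 0;
}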
Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/inode.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7b8fc7d33def..8474028d7848 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -478,6 +478,17 @@ static const match_table_t tokens = { {OPT_ERR, NULL} }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) { char *p; @@ -488,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) while ((p = strsep(&opt, ",")) != NULL) { int token; int value; + unsigned uv; substring_t args[MAX_OPT_ARGS]; if (!*p) continue; @@ -511,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_USER_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), value); + d->user_id = make_kuid(current_user_ns(), uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), value); + d->group_id = make_kgid(current_user_ns(), uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; -- cgit v1.2.2 From c55a01d360afafcd52bc405c044a6ebf5de436d5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Jul 2014 15:28:51 +0200 Subject: fuse: avoid scheduling while atomic As reported by Richard Sharpe, an attempt to use fuse_notify_inval_entry() triggers complaints about scheduling while atomic: BUG: scheduling while atomic: fuse.hf/13976/0x10000001 This happens because fuse_notify_inval_entry() attempts to allocate memory with GFP_KERNEL, holding "struct fuse_copy_state" mapped by kmap_atomic(). Introduced by commit 58bda1da4b3c "fuse/dev: use atomic maps". Fix by moving the map/unmap to just cover the actual memcpy operation.
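The essence of the fix, remembering (page, offset) in the copy state and mapping only around the memcpy, can be sketched as a userspace model; all names below are hypothetical, and the assert stands in for the "no sleeping while a kmap_atomic() mapping is live" rule:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool in_atomic_map;	/* models "an atomic mapping is live" */

static void *map_atomic(void *page)	{ in_atomic_map = true; return page; }
static void unmap_atomic(void)		{ in_atomic_map = false; }

/* models a GFP_KERNEL allocation: may sleep, so it must never run
 * while an atomic mapping is held */
static void *may_sleep_alloc(size_t n)
{
	assert(!in_atomic_map);
	return malloc(n);
}

struct copy_state {
	void *pg;		/* keep the page ... */
	unsigned int offset;	/* ... and offset, not a live mapping */
};

static void copy_to_page(struct copy_state *cs, const void *src, size_t n)
{
	char *addr = map_atomic(cs->pg);	/* map only around the copy */

	memcpy(addr + cs->offset, src, n);
	unmap_atomic();
	cs->offset += n;
}

int main(void)
{
	struct copy_state cs = { .pg = malloc(4096), .offset = 0 };
	void *tmp;

	copy_to_page(&cs, "hello", 5);
	tmp = may_sleep_alloc(16);	/* fine: no mapping held here */
	copy_to_page(&cs, " world", 6);
	printf("%.*s\n", (int)cs.offset, (char *)cs.pg);
	free(tmp);
	free(cs.pg);
	return 0;
}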
Original patch from Maxim Patlasov Reported-by: Richard Sharpe Signed-off-by: Miklos Szeredi Cc: # v3.15+ --- fs/fuse/dev.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac71ce373e4..75fa055012b2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -643,9 +643,8 @@ struct fuse_copy_state { unsigned long seglen; unsigned long addr; struct page *pg; - void *mapaddr; - void *buf; unsigned len; + unsigned offset; unsigned move_pages:1; }; @@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - if (!cs->write) { - kunmap_atomic(cs->mapaddr); - } else { - kunmap_atomic(cs->mapaddr); + if (cs->write) buf->len = PAGE_SIZE - cs->len; - } cs->currbuf = NULL; - cs->mapaddr = NULL; - } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr); + } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); - cs->mapaddr = NULL; } + cs->pg = NULL; } /* @@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) */ static int fuse_copy_fill(struct fuse_copy_state *cs) { - unsigned long offset; + struct page *page; int err; unlock_request(cs->fc, cs->req); @@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = kmap_atomic(buf->page); + cs->pg = buf->page; + cs->offset = buf->offset; cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; cs->nr_segs--; } else { - struct page *page; - if (cs->nr_segs == cs->pipe->buffers) return -EIO; @@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page); - cs->buf = cs->mapaddr; + cs->pg = page; + cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; @@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + err = get_user_pages_fast(cs->addr, 1, cs->write, &page); if (err < 0) return err; BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->pg = page; + cs->offset = cs->addr % PAGE_SIZE; + cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); cs->seglen -= cs->len; cs->addr += cs->len; } @@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + if (cs->write) - memcpy(cs->buf, *val, ncpy); + memcpy(buf, *val, ncpy); else - memcpy(*val, cs->buf, ncpy); + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; - cs->buf += ncpy; + cs->offset += ncpy; return ncpy; } @@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = kmap_atomic(buf->page); - cs->buf = cs->mapaddr + buf->offset; + cs->pg = buf->page; + cs->offset = buf->offset; err = lock_request(cs->fc, cs->req); if (err) -- cgit v1.2.2 From c3a4561796cffae6996264876ffca147b5c3709a Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 6 Jul 2014 11:34:43 +0800 Subject: nfsd: Fix bad reserving space for 
encoding rdattr_error Introduced by commit 561f0ed498 (nfsd4: allow large readdirs). Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2fc7abebeb9b..b56b1cc02718 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2641,7 +2641,7 @@ nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { __be32 *p; - p = xdr_reserve_space(xdr, 6); + p = xdr_reserve_space(xdr, 20); if (!p) return NULL; *p++ = htonl(2); -- cgit v1.2.2 From 74adf83f5d7720925499b4938f930591f947b660 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Jun 2014 11:07:03 +0200 Subject: nfs: only show Posix ACLs in listxattr if actually present The big ACL switched nfs to use generic_listxattr, which calls all existing ->list handlers. Add a custom .listxattr implementation that only lists the ACLs if they actually are present on the given inode. Signed-off-by: Christoph Hellwig Reported-by: Philippe Troin Tested-by: Philippe Troin Fixes: 013cdf1088d7 (nfs: use generic posix ACL infrastructure ...) Cc: stable@vger.kernel.org # 3.14+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs3proc.c | 4 ++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 871d6eda8dba..8f854dde4150 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -247,3 +247,46 @@ const struct xattr_handler *nfs3_xattr_handlers[] = { &posix_acl_default_xattr_handler, NULL, }; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) +{ + struct posix_acl *acl; + char *p = data + *result; + + acl = get_acl(inode, type); + if (!acl) + return 0; + + posix_acl_release(acl); + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e7daa42bbc86..f0afa291fd58 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -885,7 +885,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, @@ -899,7 +899,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, -- cgit v1.2.2 From 8bc6f60e3f7f31c4ce370b4b27b8f4b355b7f07e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 11 Jun 2014 18:32:23 +0800 Subject: f2fs: remove unused variables in f2fs_sm_info Remove unused variables in struct f2fs_sm_info. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/segment.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e51c732b0dd9..7ef7acdb9350 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -342,9 +342,6 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f25f0e07e26f..b22d5a0652bb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1885,8 +1885,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); -- cgit v1.2.2 From d6b7d4b31dfd5a454a71c445b8086bc098237334 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 12 Jun 2014 13:23:41 +0800 Subject: f2fs: check lower bound nid value in check_nid_range This patch adds lower-bound verification for nid in check_nid_range, so that reserved nids (0, node, meta) passed by the caller can be checked there. check_nid_range can then be used in f2fs_nfs_get_inode to simplify the code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 1 + fs/f2fs/super.c | 4 +--- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ef7acdb9350..58df97e174d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -641,7 +641,8 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) */ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index adc622c6bdce..2cf6962f6cc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -78,6 +78,7 @@ static int do_read_inode(struct inode *inode) if (check_nid_range(sbi, inode->i_ino)) { f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", (unsigned long) inode->i_ino); + WARN_ON(1); return -EINVAL; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b2b18637cb9e..8f96d9372ade 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -689,9 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (unlikely(ino < F2FS_ROOT_INO(sbi))) - return ERR_PTR(-ESTALE); - if (unlikely(ino >= NM_I(sbi)->max_nid)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* -- cgit v1.2.2 From 90d72459ccb47335a4348947506fd091e63f7cf8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jun 2014 10:34:00 +0800 Subject: f2fs: fix error path in init_inode_metadata If we fail in this path: ->init_inode_metadata ->make_empty_dir ->get_new_data_page ->grab_cache_page return -ENOMEM We will hit a BUG_ON in the error path of init_inode_metadata when calling remove_inode_page
because i_block = 2 (one inode block will be released later & one dentry block). We should release the dentry block in init_inode_metadata to avoid this BUG_ON and to avoid leaking the dentry block, because we never get a second chance to release that block in ->evict_inode: in the error path above we make this inode 'bad'. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 966acb039e3b..a4addd72ebbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -376,11 +376,11 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); +error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0); remove_dirty_dir_inode(inode); -error: remove_inode_page(inode); return ERR_PTR(err); } -- cgit v1.2.2 From dd4d961fe7d1cb1cba6e9d8d132475d4917ba864 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jun 2014 14:13:13 +0800 Subject: f2fs: release new entry page correctly in error path of f2fs_rename This patch corrects the code releasing new_page to avoid a BUG_ON in the error path of f2fs_rename. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9138c32aa698..aa3ddb228f98 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -474,7 +474,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: - f2fs_put_page(new_page, 1); + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); -- cgit v1.2.2 From b2c0829912493df596706a1a036c67beb1bd6ff5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 30 Jun 2014 18:09:55 +0900 Subject: f2fs: do checkpoint for the renamed inode If an inode is renamed, it should be registered as file_lost_pino to conduct a checkpoint at f2fs_sync_file. Otherwise, the inode cannot be recovered due to no dent_mark in the following scenario. Note that this scenario is from xfstests/322. 1. create "a" 2. fsync "a" 3. rename "a" to "b" 4. fsync "b" 5. Sudden power-cut After recovery is done, "b" should be seen. However, the result shows "a", since the recovery procedure does not enter recover_dentry due to no dent_mark. The reason is as follows. - The nid of "a" is checkpointed during #2, f2fs_sync_file. - The inode page for "b" produced by #3 is written without dent_mark by sync_node_pages. So, this patch fixes this bug by assigning file_lost_pino to "a"'s inode. If the pino is lost, f2fs_sync_file conducts a checkpoint, and then recovers the latest pino and its dentry information for further recovery.
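A toy model of the contract described above, with hypothetical names (it is not f2fs code), may make the recovery logic clearer:

#include <stdbool.h>
#include <stdio.h>

struct inode_model {
	bool lost_pino;		/* set on rename: parent ino may be stale */
};

static void rename_inode(struct inode_model *i)
{
	i->lost_pino = true;
}

static void fsync_inode(struct inode_model *i)
{
	if (i->lost_pino) {
		/* conduct a checkpoint so the new dentry is durable and
		 * the parent ino is re-established for later recovery */
		printf("checkpoint\n");
		i->lost_pino = false;
	} else {
		printf("fast path: node-only sync\n");
	}
}

int main(void)
{
	struct inode_model a = { false };

	fsync_inode(&a);	/* step 2: fast path */
	rename_inode(&a);	/* step 3 */
	fsync_inode(&a);	/* step 4: forces a checkpoint */
	return 0;
}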
Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index aa3ddb228f98..a6bdddc33ce2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -417,9 +417,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); @@ -448,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); @@ -457,9 +458,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); -- cgit v1.2.2 From 2743f865543c0c4a5e12fc13edb2bf89a6e9687c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 28 Jun 2014 01:00:41 +0900 Subject: f2fs: check bdi->dirty_exceeded when trying to skip data writes If we don't check the current backing device status, balance_dirty_pages can fall into an infinite pausing routine. This can occur when a lot of directories make a small number of dirty dentry pages, including files. Reported-by: Brian Chadwick Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9dfb9a042fd2..4b697ccc9b0c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -42,6 +42,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); } -- cgit v1.2.2 From 50e1f8d22199b557337b3d1ec8520e4c5aa5c76e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Jul 2014 09:39:32 +0800 Subject: f2fs: avoid to access NULL pointer in issue_flush_thread Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=75861 Denis 2014-05-10 11:28:59 UTC reported: "F2FS-fs (mmcblk0p28): mounting.. Unable to handle kernel NULL pointer dereference at virtual address 00000018 ... [] (_raw_spin_lock+0x3c/0x70) from [] (issue_flush_thread+0x50/0x17c) [] (issue_flush_thread+0x50/0x17c) from [] (kthread+0x98/0xa4) [] (kthread+0x98/0xa4) from [] (kernel_thread_exit+0x0/0x8)" This patch assigns cmd_control_info in sm_info before issue_flush_thread is created, which makes sure that the issue flush thread has no chance to access invalid info in fcc.
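The ordering rule behind this fix, publish the shared pointer before the worker can run and unpublish it on failure, can be sketched with pthreads; the names below are hypothetical and this is a userspace model, not the f2fs code:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct flush_ctl { int queued; };
static struct flush_ctl *ctl;		/* what the worker dereferences */

static void *flush_thread(void *arg)
{
	(void)arg;
	/* Safe only because ctl was published before pthread_create() */
	printf("worker sees %d queued\n", ctl->queued);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct flush_ctl *fcc = calloc(1, sizeof(*fcc));

	ctl = fcc;			/* publish first */
	if (pthread_create(&t, NULL, flush_thread, NULL)) {
		ctl = NULL;		/* roll back on failure */
		free(fcc);
		return 1;
	}
	pthread_join(t, NULL);
	free(fcc);
	return 0;
}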
Signed-off-by: Chao Yu Reviewed-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b22d5a0652bb..d04613df710a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,14 +272,15 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); + sbi->sm_info->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; return err; } - sbi->sm_info->cmd_control_info = fcc; return err; } -- cgit v1.2.2 From 4237ba43b65aa989674c89fc4f2fe46eebc501ee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Jul 2014 10:50:19 +0200 Subject: fuse: restructure ->rename2() Make ->rename2() universal, i.e. able to handle zero flags. This is to make future change of the API easier. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 202a9721be93..0c6048247a34 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -815,13 +815,6 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) -{ - return fuse_rename_common(olddir, oldent, newdir, newent, 0, - FUSE_RENAME, sizeof(struct fuse_rename_in)); -} - static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) @@ -832,17 +825,30 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; - if (fc->no_rename2 || fc->minor < 23) - return -EINVAL; + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, - FUSE_RENAME2, sizeof(struct fuse_rename2_in)); - if (err == -ENOSYS) { - fc->no_rename2 = 1; - err = -EINVAL; + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); } + return err; +} +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename2(olddir, oldent, newdir, newent, 0); } static int fuse_link(struct dentry *entry, struct inode *newdir, -- cgit v1.2.2 From f9ae9cf5d72b3926ca48ea60e15bdbb840f42372 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2014 13:55:40 -0400 Subject: ext4: revert commit which was causing fs corruption after journal replays Commit 007649375f6af2 ("ext4: initialize multi-block allocator before checking block descriptors") causes the block group descriptor's count of the number of free blocks to become inconsistent with the number of free blocks in the allocation bitmap. 
This is a harmless form of fs corruption, but it causes the kernel to potentially remount the file system read-only, or to panic, depending on the file system's error behavior. Thanks to Eric Whitney for his tireless work to reproduce and to find the guilty commit. Fixes: 007649375f6af2 ("ext4: initialize multi-block allocator before checking block descriptors") Cc: stable@vger.kernel.org # 3.15 Reported-by: David Jander Reported-by: Matteo Croce Tested-by: Eric Whitney Suggested-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6297c07c937f..6df7bc611dbd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3879,38 +3879,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - - /* - * set up enough so that it can read an inode, - * and create new inode for buddy allocator - */ - sbi->s_gdb_count = db_count; - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); - goto failed_mount2; - } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2a; + goto failed_mount2; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2a; + goto failed_mount2; } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3945,6 +3926,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; + /* + * set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4134,13 +4123,21 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount5; + goto failed_mount4a; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); goto failed_mount5; } @@ -4217,8 +4214,11 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_release_system_zone(sb); + ext4_mb_release(sb); failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4242,14 +4242,11 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); -failed_mount2a: - ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: - ext4_ext_release(sb); if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { -- cgit v1.2.2 From 3f1f9b851311a76226140b55b1ea22111234a7c2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Jul 2014 15:32:24 -0400 Subject: ext4: fix a potential deadlock in __ext4_es_shrink() This fixes the following lockdep complaint: [ INFO: possible circular locking dependency detected ] 3.16.0-rc2-mm1+ #7 Tainted: G O ------------------------------------------------------- kworker/u24:0/4356 is trying to acquire lock: (&(&sbi->s_es_lru_lock)->rlock){+.+.-.}, at: [] __ext4_es_shrink+0x4f/0x2e0 but task is already holding lock: (&ei->i_es_lock){++++-.}, at: [] ext4_es_insert_extent+0x71/0x180 which lock already depends on the new lock. Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ei->i_es_lock); lock(&(&sbi->s_es_lru_lock)->rlock); lock(&ei->i_es_lock); lock(&(&sbi->s_es_lru_lock)->rlock); *** DEADLOCK *** 6 locks held by kworker/u24:0/4356: #0: ("writeback"){.+.+.+}, at: [] process_one_work+0x180/0x560 #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [] process_one_work+0x180/0x560 #2: (&type->s_umount_key#22){++++++}, at: [] grab_super_passive+0x44/0x90 #3: (jbd2_handle){+.+...}, at: [] start_this_handle+0x189/0x5f0 #4: (&ei->i_data_sem){++++..}, at: [] ext4_map_blocks+0x132/0x550 #5: (&ei->i_es_lock){++++-.}, at: [] ext4_es_insert_extent+0x71/0x180 stack backtrace: CPU: 0 PID: 4356 Comm: kworker/u24:0 Tainted: G O 3.16.0-rc2-mm1+ #7 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: writeback bdi_writeback_workfn (flush-253:0) ffffffff8213dce0 ffff880014b07538 ffffffff815df0bb 0000000000000007 ffffffff8213e040 ffff880014b07588 ffffffff815db3dd ffff880014b07568 ffff880014b07610 ffff88003b868930 ffff88003b868908 ffff88003b868930 Call Trace: [] dump_stack+0x4e/0x68 [] print_circular_bug+0x1fb/0x20c [] __lock_acquire+0x163e/0x1d00 [] ? retint_restore_args+0xe/0xe [] ? __slab_alloc+0x4a8/0x4ce [] ? __ext4_es_shrink+0x4f/0x2e0 [] lock_acquire+0x87/0x120 [] ? __ext4_es_shrink+0x4f/0x2e0 [] ? ext4_es_free_extent+0x5d/0x70 [] _raw_spin_lock+0x39/0x50 [] ? __ext4_es_shrink+0x4f/0x2e0 [] ? kmem_cache_alloc+0x18b/0x1a0 [] __ext4_es_shrink+0x4f/0x2e0 [] ext4_es_insert_extent+0xc8/0x180 [] ext4_map_blocks+0x1c4/0x550 [] ext4_writepages+0x6d4/0xd00 ... 
Reported-by: Minchan Kim Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Cc: Zheng Liu --- fs/ext4/extents_status.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3f5c188953a4..0b7e28e7eaa4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -966,10 +966,10 @@ retry: continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei) + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) continue; - write_lock(&ei->i_es_lock); shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); -- cgit v1.2.2 From bf40c92635d63fcc574c52649f7cda13e0418ac1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 12 Jul 2014 16:11:42 -0400 Subject: ext4: fix potential null pointer dereference in ext4_free_inode Fix a potential null pointer dereferencing problem caused by e43bb4e612 ("ext4: decrement free clusters/inodes counters when block group declared bad"). Reported-by: Dan Carpenter Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0840bf321cdb..5b87fc36aab8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -338,7 +338,7 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, -- cgit v1.2.2 From 17089a29a25a3bfe8d14520cd866b7d635ffe5ba Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 11 Jul 2014 10:20:45 -0400 Subject: nfs: mark nfs_page reqs with flag for extra ref Change the use of PG_INODE_REF: set it when taking an extra reference on subrequests, and take care to release it only once for each request.
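A userspace model of the flagged extra reference may help; struct req and its helpers are hypothetical, with the flag standing in for PG_INODE_REF and the conditional clear standing in for test_and_clear_bit():

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct req {
	int refs;
	bool inode_ref;		/* models PG_INODE_REF */
};

static void take_inode_ref(struct req *r)
{
	assert(!r->inode_ref);	/* only one extra ref per request */
	r->inode_ref = true;
	r->refs++;
}

static void drop_inode_ref(struct req *r)
{
	if (r->inode_ref) {	/* test-and-clear: release at most once */
		r->inode_ref = false;
		r->refs--;
	}
}

int main(void)
{
	struct req r = { .refs = 1, .inode_ref = false };

	take_inode_ref(&r);
	drop_inode_ref(&r);
	drop_inode_ref(&r);	/* second drop is a no-op, refs stays 1 */
	printf("refs = %d\n", r.refs);
	return 0;
}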
Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 4 +++- fs/nfs/write.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b6ee3a6ee96d..7368b2130a41 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -251,8 +251,10 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) /* grab extra ref if head request has extra ref from * the write/commit path to handle handoff between write * and commit lists */ - if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); + } } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 98ff061ccaf3..8e5745a4deff 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -448,7 +448,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; - set_bit(PG_INODE_REF, &req->wb_flags); + /* this a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -474,7 +476,9 @@ static void nfs_inode_remove_request(struct nfs_page *req) nfsi->npages--; spin_unlock(&inode->i_lock); } - nfs_release_request(req); + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); } static void -- cgit v1.2.2 From 85710a837c2026aae80b7c64187edf1f10027b0b Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 11 Jul 2014 10:20:46 -0400 Subject: nfs: nfs_page should take a ref on the head req nfs_pages that aren't the head of a group must take a reference on the head as long as ->wb_head is set to it. This stops the head from hitting a refcount of 0 while there is still an active nfs_page for the page group. This avoids kref warnings in the writeback code when the page group head is found and referenced.
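The pinning rule can be reduced to a small userspace refcount model; struct nreq and the helpers below are hypothetical illustrations, not the nfs_page code:

#include <stdio.h>
#include <stdlib.h>

struct nreq {
	int refs;
	struct nreq *head;	/* group head; == self for the head */
};

static void get(struct nreq *r) { r->refs++; }

static void put(struct nreq *r)
{
	if (--r->refs == 0) {
		printf("freeing %p\n", (void *)r);
		free(r);
	}
}

static struct nreq *new_head(void)
{
	struct nreq *h = calloc(1, sizeof(*h));

	h->refs = 1;
	h->head = h;
	return h;
}

static struct nreq *new_sub(struct nreq *head)
{
	struct nreq *s = calloc(1, sizeof(*s));

	s->refs = 1;
	s->head = head;
	get(head);		/* the fix: a subrequest pins the head */
	return s;
}

static void destroy_sub(struct nreq *s)
{
	put(s->head);		/* release the pin on the head */
	put(s);
}

int main(void)
{
	struct nreq *h = new_head();
	struct nreq *s = new_sub(h);

	put(h);			/* owner drops head: still pinned by s */
	destroy_sub(s);		/* now the head is freed too */
	return 0;
}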
Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7368b2130a41..05a63593a61f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -239,15 +239,21 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) WARN_ON_ONCE(prev == req); if (!prev) { + /* a head request */ req->wb_head = req; req->wb_this_page = req; } else { + /* a subrequest */ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); req->wb_head = prev->wb_head; req->wb_this_page = prev->wb_this_page; prev->wb_this_page = req; + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + /* grab extra ref if head request has extra ref from * the write/commit path to handle handoff between write * and commit lists */ @@ -271,6 +277,10 @@ nfs_page_group_destroy(struct kref *kref) struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); struct nfs_page *tmp, *next; + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) return; -- cgit v1.2.2 From 84d3a9a913ba6a90c79b7763d063bb42554a8906 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 11 Jul 2014 10:20:47 -0400 Subject: nfs: change find_request to find_head_request nfs_page_find_request_locked* should find the head request for that page. Rename the functions and add comments to make this clear, and fix a bug that could return a subrequest when page_private isn't set on the page. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8e5745a4deff..53c4a9917dac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -91,8 +91,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */ static struct nfs_page * -nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; @@ -104,25 +111,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) /* Linearly search the commit list for the correct req */ list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { if (freq->wb_page == page) { - req = freq; + req = freq->wb_head; break; } } } - if (req) + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } return req; } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. 
+ */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -282,7 +297,7 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) break; if (nfs_lock_request(req)) @@ -773,7 +788,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; @@ -881,7 +896,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) return 0; l_ctx = req->wb_lock_context; @@ -1575,7 +1590,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) for (;;) { wait_on_page_writeback(page); - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) break; if (nfs_lock_request(req)) { -- cgit v1.2.2 From d458138353726ea6dcbc53ae3597e489d0432c25 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 11 Jul 2014 10:20:48 -0400 Subject: nfs: handle multiple reqs in nfs_page_async_flush Change nfs_find_and_lock_request so nfs_page_async_flush can handle multiple requests in a page. There is only one request for a page the first time nfs_page_async_flush is called, but if a write or commit fails, async_flush is called again and there may be multiple requests associated with the page. The solution is to merge all the requests in a page group into a single request before calling nfs_pageio_add_request. Rename nfs_find_and_lock_request to nfs_lock_and_join_requests and change it to first lock all requests for the page, then cancel and merge all subrequests into the head request. 
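The join relies on the circular wb_this_page ring shown in the diffs below; a minimal sketch of that traversal pattern (the helper name is hypothetical):

	/* Hypothetical helper: visit every request in a page group.
	 * The ring is circular, so iteration stops once we come back
	 * around to the head request. */
	static void page_group_for_each(struct nfs_page *head,
					void (*fn)(struct nfs_page *))
	{
		struct nfs_page *req = head;

		do {
			fn(req);
			req = req->wb_this_page;
		} while (req != head);
	}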
Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 1 + fs/nfs/pagelist.c | 4 +- fs/nfs/write.c | 255 +++++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 235 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 82ddbf46660e..f415cbf9f6c3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -244,6 +244,7 @@ void nfs_pgio_data_release(struct nfs_pgio_data *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); static inline void nfs_iocounter_init(struct nfs_io_counter *c) { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 05a63593a61f..0aefc8102b6b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,8 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static void nfs_free_request(struct nfs_page *); - static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; @@ -406,7 +404,7 @@ static void nfs_clear_request(struct nfs_page *req) * * Note: Should never be called with the spinlock held! */ -static void nfs_free_request(struct nfs_page *req) +void nfs_free_request(struct nfs_page *req) { WARN_ON_ONCE(req->wb_this_page != req); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53c4a9917dac..9f4424c464a0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -46,6 +46,7 @@ static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -289,36 +290,246 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. 
+ */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) +{ + struct nfs_page *tmp; + int ret; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + nfs_clear_request_commit(subreq); + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } + } +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. 
+ */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; int ret; +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + spin_lock(&inode->i_lock); - for (;;) { - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - if (req == NULL) - break; - if (nfs_lock_request(req)) - break; - /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_lock_request() will always - * succeed provided that someone hasn't already marked the - * request as dirty (in which case we don't care). - */ + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { spin_unlock(&inode->i_lock); - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - if (ret != 0) + return NULL; + } + + /* lock each request in the page group */ + nfs_page_group_lock(head); + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order. If not, it's a programming error. + */ + WARN_ON_ONCE(subreq->wb_offset != + (head->wb_offset + total_bytes)); + + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + return ERR_PTR(ret); - spin_lock(&inode->i_lock); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. + * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_list_remove_request(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. 
+ */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clear_request_commit the head req and clean up + * requests on destroy list */ spin_unlock(&inode->i_lock); - return req; + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* clean up commit list state */ + nfs_clear_request_commit(head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; } /* @@ -331,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page, nonblock); + req = nfs_lock_and_join_requests(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); -- cgit v1.2.2 From 3e2170451e91327bfa8a82040fea78043847533a Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 11 Jul 2014 10:20:49 -0400 Subject: nfs: handle multiple reqs in nfs_wb_page_cancel Use nfs_lock_and_join_requests to merge all subrequests into the head request - this cancels and dereferences all subrequests. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9f4424c464a0..bdc4db23951e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1799,27 +1799,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - for (;;) { - wait_on_page_writeback(page); - req = nfs_page_find_head_request(page); - if (req == NULL) - break; - if (nfs_lock_request(req)) { - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_and_release_request(req); - break; - } - ret = nfs_wait_on_request(req); - nfs_release_request(req); - if (ret < 0) - break; + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private pointer and + * release it */ + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_and_release_request(req); } + return ret; } -- cgit v1.2.2 From aafe37504c70954fc104c88d9d15d553572dae69 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 12 Jul 2014 17:23:39 -0400 Subject: NFS: Remove 2 unused variables Cc: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 -- fs/nfs/write.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8f98138cbc43..f11b9eed0de1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -756,7 +756,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { - bool do_destroy = true; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); @@ -765,7 +764,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) case NFS_IOHDR_NEED_COMMIT: 
kref_get(&req->wb_kref); nfs_mark_request_commit(req, hdr->lseg, &cinfo); - do_destroy = false; } nfs_unlock_and_release_request(req); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bdc4db23951e..5e2f10304548 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -868,7 +868,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; unsigned long bytes = 0; - bool do_destroy; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -898,7 +897,6 @@ remove_req: next: nfs_unlock_request(req); nfs_end_page_writeback(req); - do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); nfs_release_request(req); } out: -- cgit v1.2.2 From f563b89b182594f827b4100bd34f916339785a77 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 13 Jul 2014 15:13:19 -0400 Subject: NFS: Don't reset pg_moreio in __nfs_pageio_add_request Once we've started sending unstable NFS writes, we do not want to clear pg_moreio, or we may end up sending the very last request as a stable write if the commit lists are still empty. Do, however, reset pg_moreio in the case where we end up having to recoalesce the write if an attempt to use pNFS failed. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 0aefc8102b6b..17fab89f6358 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -935,7 +935,6 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; - desc->pg_moreio = 0; if (desc->pg_recoalesce) return 0; /* retry add_request for this subreq */ @@ -982,6 +981,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) desc->pg_count = 0; desc->pg_base = 0; desc->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { struct nfs_page *req; -- cgit v1.2.2 From 27f1b36326bc8b6911e7052bc4b80a10234f0ec5 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 10 Jul 2014 15:32:43 +0400 Subject: fuse: release temporary page if fuse_writepage_locked() failed The temporary page (tmp_page) must be freed when fuse_write_file_get() returns NULL; otherwise it is leaked. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 96d513e01a5d..35b6f31ecc38 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1722,7 +1722,7 @@ static int fuse_writepage_locked(struct page *page) error = -EIO; req->ff = fuse_write_file_get(fc, fi); if (!req->ff) - goto err_free; + goto err_nofile; fuse_write_fill(req, req->ff, page_offset(page), 0); @@ -1750,6 +1750,8 @@ static int fuse_writepage_locked(struct page *page) return 0; +err_nofile: + __free_page(tmp_page); err_free: fuse_request_free(req); err: -- cgit v1.2.2 From f2b3455e47c72bef80fe584adb9bb9bc5afdd99c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Jun 2014 18:35:15 +0200 Subject: fuse: replace count*size kzalloc by kcalloc kcalloc checks the count*size multiplication for overflow.
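In this particular call site the count is a compile-time constant, so the change is about using the safer idiom consistently; the general hazard that kcalloc() guards against looks like this (illustrative only, with n standing in for an untrusted count):

	/* n * sizeof(...) can wrap around, yielding a short buffer that
	 * later writes overrun */
	pages = kzalloc(n * sizeof(struct page *), GFP_NOFS);

	/* kcalloc() checks the multiplication and returns NULL on
	 * overflow instead of allocating a short buffer */
	pages = kcalloc(n, sizeof(struct page *), GFP_NOFS);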
Signed-off-by: Fabian Frederick Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 35b6f31ecc38..1570dad1915d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1992,8 +1992,8 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kzalloc(sizeof(struct page *) * - FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) goto out; -- cgit v1.2.2 From 263782c1c95bbddbb022dc092fd89a36bb8d5577 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 14 Jul 2014 12:49:26 -0400 Subject: aio: protect reqs_available updates from changes in interrupt handlers As of commit f8567a3845ac05bb28f3c1b478ef752762bd39ef it is now possible to have put_reqs_available() called from irq context. While put_reqs_available() is per cpu, it did not protect itself from interrupts on the same CPU. This led to aio_complete() corrupting the available io requests count when run under heavy O_DIRECT workloads as reported by Robert Elliott. Fix this by disabling irq updates around the per cpu batch updates of reqs_available. Many thanks to Robert and folks for testing and tracking this down. Reported-by: Robert Elliott Tested-by: Robert Elliott Signed-off-by: Benjamin LaHaise Cc: Jens Axboe , Christoph Hellwig Cc: stable@vger.kernel.org --- fs/aio.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 955947ef3e02..1c9c5f0a9e2b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -830,16 +830,20 @@ void exit_aio(struct mm_struct *mm) static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } + local_irq_restore(flags); preempt_enable(); } @@ -847,10 +851,12 @@ static bool get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -869,6 +875,7 @@ static bool get_reqs_available(struct kioctx *ctx) ret = true; kcpu->reqs_available--; out: + local_irq_restore(flags); preempt_enable(); return ret; } -- cgit v1.2.2 From aa182e64f16fc29a4984c2d79191b161888bbd9b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 15 Jul 2014 07:08:10 +1000 Subject: Revert "xfs: block allocation work needs to be kswapd aware" This reverts commit 1f6d64829db78a7e1d63e15c9f48f0a5d2b5a679. This commit resulted in regressions in performance in low memory situations where kswapd was doing writeback of delayed allocation blocks. It resulted in significant parallelism of the kswapd work and, with the special kswapd flags, meant that hundreds of active allocations could dip into kswapd specific memory reserves and avoid being throttled. This caused a large amount of performance variation, as well as random OOM-killer invocations that didn't previously exist.
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 16 +++------------- fs/xfs/xfs_bmap_util.h | 13 ++++++------- 2 files changed, 9 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 703b3ec1796c..057f671811d6 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -258,23 +258,14 @@ xfs_bmapi_allocate_worker( struct xfs_bmalloca *args = container_of(work, struct xfs_bmalloca, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; - /* - * we are in a transaction context here, but may also be doing work - * in kswapd context, and hence we may need to inherit that state - * temporarily to ensure that we don't block waiting for memory reclaim - * in any way. - */ - if (args->kswapd) - new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; - - current_set_flags_nested(&pflags, new_pflags); + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); args->result = __xfs_bmapi_allocate(args); complete(args->done); - current_restore_flags_nested(&pflags, new_pflags); + current_restore_flags_nested(&pflags, PF_FSTRANS); } /* @@ -293,7 +284,6 @@ xfs_bmapi_allocate( args->done = &done; - args->kswapd = current_is_kswapd(); INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 075f72232a64..935ed2b24edf 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -50,13 +50,12 @@ struct xfs_bmalloca { xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ - bool eof; /* set if allocating past last extent */ - bool wasdel; /* replacing a delayed allocation */ - bool userdata;/* set if is user data */ - bool aeof; /* allocated space at eof */ - bool conv; /* overwriting unwritten extents */ - bool stack_switch; - bool kswapd; /* allocation in kswapd context */ + char eof; /* set if allocating past last extent */ + char wasdel; /* replacing a delayed allocation */ + char userdata;/* set if is user data */ + char aeof; /* allocated space at eof */ + char conv; /* overwriting unwritten extents */ + char stack_switch; int flags; struct completion *done; struct work_struct work; -- cgit v1.2.2 From cf11da9c5d374962913ca5ba0ce0886b58286224 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 15 Jul 2014 07:08:24 +1000 Subject: xfs: refine the allocation stack switch The allocation stack switch at xfs_bmapi_allocate() has served its purpose, but is no longer a sufficient solution to the stack usage problem we have in the XFS allocation path. Whilst the kernel stack size is now 16k, that is not a valid reason for undoing all our "keep stack usage down" modifications. What it does allow us to do is have the freedom to refine and perfect the modifications knowing that if we get it wrong it won't blow up in our faces - we have a safety net now. This is important because older kernels have smaller stacks, are still supported, and are demonstrating a wide range of different stack overflows. Red Hat has several open bugs for allocation based stack overflows from directory modifications and direct IO block allocation and these problems still need to be solved.
If we can solve them upstream, then distros won't need to bake their own unique solutions. To that end, I've observed that every allocation based stack overflow report has had a specific characteristic - it has happened during or directly after a bmap btree block split. That event requires a new block to be allocated to the tree, and so we effectively stack one allocation stack on top of another, and that's when we get into trouble. A further observation is that bmap btree block splits are much rarer than writeback allocation - over a range of different workloads I've observed the ratio of bmap btree inserts to splits ranges from 100:1 (xfstests run) to 10000:1 (local VM image server with sparse files that range in the hundreds of thousands to millions of extents). Either way, bmap btree split events are much, much rarer than allocation events. Finally, we have to move the kswapd state to the allocation workqueue work when allocation is done on behalf of kswapd. This is proving to cause significant perturbation in performance under memory pressure and appears to be generating allocation deadlock warnings under some workloads, so avoiding the use of a workqueue for the majority of kswapd writeback allocation will minimise the impact of such behaviour. Hence it makes sense to move the stack switch to xfs_btree_split() and only do it for bmap btree splits. Stack switches during allocation will be much rarer, so there won't be significant performance overhead caused by switching stacks. The worst case stack from all allocation paths will be split, not just writeback. And the majority of memory allocations will be done in the correct context (e.g. kswapd) without causing additional latency, and so we simplify the memory reclaim interactions between processes, workqueues and kswapd. The worst stack I've been able to generate with this patch in place is 5600 bytes deep. It's very revealing because we exit XFS at: 37) 1768 64 kmem_cache_alloc+0x13b/0x170 about 1800 bytes of stack consumed, and the remaining 3800 bytes (and 36 functions) is memory reclaim, swap and the IO stack. And this occurs in the inode allocation from an open(O_CREAT) syscall, not writeback. The amount of stack being used is much less than I've previously been able to generate - fs_mark testing has been able to generate stack usage of around 7k without too much trouble; with this patch it's only just getting to 5.5k. This is primarily because the metadata allocation paths (e.g. directory blocks) are no longer causing double splits on the same stack, and hence now stack tracing is showing swapping being the worst stack consumer rather than XFS. Performance of fs_mark inode create workloads is unchanged. Performance of fs_mark async fsync workloads is consistently good with context switches reduced by around 150,000/s (30%). Performance of dbench, streaming IO and postmark is unchanged. Allocation deadlock warnings have not been seen on the workloads that generated them since adding this patch.
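As a generic sketch of the stack-switch pattern the diff below applies to xfs_btree_split() (all names here are hypothetical stand-ins; the real implementation follows): bundle the arguments into a struct, run the deep call on a kworker's fresh stack, and wait for it to complete.

	struct deep_call_args {
		int arg;
		int result;
		struct completion *done;
		struct work_struct work;
	};

	static void deep_call_worker(struct work_struct *work)
	{
		struct deep_call_args *args = container_of(work,
					struct deep_call_args, work);

		/* do_deep_call() stands in for the stack-hungry function;
		 * it now runs on the kworker's stack, not the caller's */
		args->result = do_deep_call(args->arg);
		complete(args->done);
	}

	static int deep_call(int arg)
	{
		struct deep_call_args args;
		DECLARE_COMPLETION_ONSTACK(done);

		args.arg = arg;
		args.done = &done;
		INIT_WORK_ONSTACK(&args.work, deep_call_worker);
		queue_work(system_wq, &args.work);	/* hand off to a kworker */
		wait_for_completion(&done);
		destroy_work_on_stack(&args.work);
		return args.result;
	}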
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap.c | 7 ++--- fs/xfs/xfs_bmap.h | 4 +-- fs/xfs/xfs_bmap_util.c | 43 -------------------------- fs/xfs/xfs_bmap_util.h | 13 +++----- fs/xfs/xfs_btree.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_iomap.c | 3 +- 6 files changed, 90 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 96175df211b1..75c3fe5f3d9d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4298,8 +4298,8 @@ xfs_bmapi_delay( } -int -__xfs_bmapi_allocate( +static int +xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4578,9 +4578,6 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; - if (flags & XFS_BMAPI_STACK_SWITCH) - bma.stack_switch = 1; - while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 38ba36e9b2f0..b879ca56a64c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,7 +77,6 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 -#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -86,8 +85,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } + { XFS_BMAPI_CONVERT, "CONVERT" } static inline int xfs_bmapi_aflag(int w) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 057f671811d6..64731ef3324d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -248,49 +248,6 @@ xfs_bmap_rtalloc( return 0; } -/* - * Stack switching interfaces for allocation - */ -static void -xfs_bmapi_allocate_worker( - struct work_struct *work) -{ - struct xfs_bmalloca *args = container_of(work, - struct xfs_bmalloca, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_bmapi_allocate(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Some allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Otherwise just - * call directly to avoid the context switch overhead here. - */ -int -xfs_bmapi_allocate( - struct xfs_bmalloca *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_bmapi_allocate(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - destroy_work_on_stack(&args->work); - return args->result; -} - /* * Check if the endoff is outside the last extent. If so the caller will grow * the allocation to a stripe unit boundary. 
All offsets are considered outside diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 935ed2b24edf..2fdb72d2c908 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -50,12 +50,11 @@ struct xfs_bmalloca { xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ - char eof; /* set if allocating past last extent */ - char wasdel; /* replacing a delayed allocation */ - char userdata;/* set if is user data */ - char aeof; /* allocated space at eof */ - char conv; /* overwriting unwritten extents */ - char stack_switch; + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ int flags; struct completion *done; struct work_struct work; @@ -65,8 +64,6 @@ struct xfs_bmalloca { int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, int *committed); int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); -int xfs_bmapi_allocate(struct xfs_bmalloca *args); -int __xfs_bmapi_allocate(struct xfs_bmalloca *args); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index bf810c6baf2b..cf893bc1e373 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -33,6 +33,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_alloc.h" /* * Cursor allocation zone. @@ -2323,7 +2324,7 @@ error1: * record (to be inserted into parent). */ STATIC int /* error */ -xfs_btree_split( +__xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, @@ -2503,6 +2504,85 @@ error0: return error; } +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. 
+ */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + /* * Copy the old inode root contents into a real block and make the * broot point to it. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6c5eb4c551e3..6d3ec2b6ee29 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -749,8 +749,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, - XFS_BMAPI_STACK_SWITCH, + count_fsb, 0, &first_block, 1, imap, &nimaps, &free_list); if (error) -- cgit v1.2.2 From 03e01349c654fbdea80d3d9b4ab599244eb55bb7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 15 Jul 2014 07:28:41 +1000 Subject: xfs: null unused quota inodes when quota is on When quota is on, it is expected that unused quota inodes have a value of NULLFSINO. The changes to support a separate project quota in 3.12 broke this rule for filesystems that do not have a project quota inode enabled, as the code now refuses to write the group quota inode if neither group nor project quotas are enabled. This regression was introduced by commit d892d58 ("xfs: Start using pquotaino from the superblock"). In this case, we should be writing NULLFSINO rather than nothing to ensure that we leave the group quota inode in a valid state while quotas are enabled. Failure to do so doesn't cause a current kernel to break - the separate project quota inodes introduced translation code to always treat a zero inode as NULLFSINO. This was introduced by commit 0102629 ("xfs: Initialize all quota inodes to be NULLFSINO"), which is also in 3.12, but older kernels do not do this and hence taking a filesystem back to an older kernel can result in quotas failing initialisation at mount time. When that happens, we see this in dmesg: [ 1649.215390] XFS (sdb): Mounting Filesystem [ 1649.316894] XFS (sdb): Failed to initialize disk quotas. [ 1649.316902] XFS (sdb): Ending clean mount By ensuring that we write NULLFSINO to quota inodes that aren't active, we avoid this problem. We have to be really careful when determining if the quota inodes are active or not, because we don't want to write a NULLFSINO if the quota inodes are active and we simply aren't updating them. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_sb.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index c3453b11f563..7703fa6770ff 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -483,10 +483,16 @@ xfs_sb_quota_to_disk( } /* - * GQUOTINO and PQUOTINO cannot be used together in versions - * of superblock that do not have pquotino. from->sb_flags - * tells us which quota is active and should be copied to - * disk.
+ * GQUOTINO and PQUOTINO cannot be used together in versions of + * superblock that do not have pquotino. from->sb_flags tells us which + * quota is active and should be copied to disk. If neither are active, + * make sure we write NULLFSINO to the sb_gquotino field as a quota + * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature + * bit is set. + * + * Note that we don't need to handle the sb_uquotino or sb_pquotino here + * as they do not require any translation. Hence the main sb field loop + * will write them appropriately from the in-core superblock. */ if ((*fields & XFS_SB_GQUOTINO) && (from->sb_qflags & XFS_GQUOTA_ACCT)) @@ -494,6 +500,17 @@ xfs_sb_quota_to_disk( else if ((*fields & XFS_SB_PQUOTINO) && (from->sb_qflags & XFS_PQUOTA_ACCT)) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); } -- cgit v1.2.2 From d68aab6b8f572406aa93b45ef6483934dd3b54a6 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Wed, 4 Jun 2014 12:22:13 +0800 Subject: quota: missing lock in dqcache_shrink_scan() Commit 1ab6c4997e04 (fs: convert fs shrinkers to new scan/count API) accidentally removed locking from quota shrinker. Fix it - dqcache_shrink_scan() should use dq_list_lock to protect the scan on free_dquots list. CC: stable@vger.kernel.org Fixes: 1ab6c4997e04a00c50c6d786c2f046adc0d1f5de Signed-off-by: Niu Yawei Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63715c0..7f30bdc57d13 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -702,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -713,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; } -- cgit v1.2.2 From 79272b3562bb44ce7dc720cd13136f5a4a53c618 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 20 Jun 2014 09:36:41 -0400 Subject: GFS2: Only wait for demote when last holder is dequeued Function gfs2_glock_dq_wait is supposed to dequeue a glock and then wait for the lock to be demoted. The problem is, if this is a shared lock, its demote will depend on the other holders, which means you might end up waiting forever because the other process is blocked. This problem is especially apparent when dealing with nested flocks. 
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c355f7320e44..278fae5b6982 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1128,7 +1128,9 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); + if (!find_first_holder(gl)) + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, + TASK_UNINTERRUPTIBLE); } /** -- cgit v1.2.2 From 94a09a3999ee978e097b5aad74034ed43bae56db Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 23 Jun 2014 14:43:32 +0100 Subject: GFS2: Fix race in glock lru glock disposal We must not leave items on the LRU list with GLF_LOCK set, since they can be removed if the glock is brought back into use, which may then potentially result in a hang, waiting for GLF_LOCK to clear. It doesn't happen very often, since it requires a glock that has not been used for a long time to be brought back into use at the same moment that the shrinker is part way through disposing of glocks. The fix is to set GLF_LOCK at a later time, when we already know that the other locks can be obtained. Also, we now only release the lru_lock in case a resched is needed, rather than on every iteration. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 278fae5b6982..c1e5b126d2ca 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1406,12 +1406,16 @@ __acquires(&lru_lock) gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); if (!spin_trylock(&gl->gl_spin)) { +add_back_to_lru: list_add(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); continue; } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_spin); + goto add_back_to_lru; + } clear_bit(GLF_LRU, &gl->gl_flags); - spin_unlock(&lru_lock); gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1419,7 +1423,7 @@ __acquires(&lru_lock) if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + cond_resched_lock(&lru_lock); } } @@ -1444,7 +1448,7 @@ static long gfs2_scan_glock_lru(int nr) gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); /* Test for being demotable */ - if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; -- cgit v1.2.2 From fe0bbd2986996b9efe3a78bf5a591b0496c7afea Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 23 Jun 2014 14:50:20 +0100 Subject: GFS2: Use GFP_NOFS when allocating glocks Normally GFP_KERNEL is ok here, but there is now a rarely used code path relating to deallocation of unlinked inodes (in certain corner cases) which if hit at times of memory shortage can cause recursion while trying to free memory. One solution would be to try and move the gfs2_glock_get() call so that it is no longer called while another glock is held, but that doesn't look at all easy, so GFP_NOFS is the best solution for the time being. 
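A minimal sketch of the recursion being avoided (the call below is the one from the diff that follows): a GFP_KERNEL allocation made with glocks held can enter direct reclaim, and reclaim can call back into the filesystem, recursing on state the caller already holds.

	/* GFP_NOFS omits __GFP_FS, so the allocator will not re-enter
	 * filesystem code (e.g. inode eviction) during reclaim */
	gl = kmem_cache_alloc(cachep, GFP_NOFS);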
Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c1e5b126d2ca..b703dcc91588 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -731,14 +731,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, cachep = gfs2_glock_aspace_cachep; else cachep = gfs2_glock_cachep; - gl = kmem_cache_alloc(cachep, GFP_KERNEL); + gl = kmem_cache_alloc(cachep, GFP_NOFS); if (!gl) return -ENOMEM; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; -- cgit v1.2.2 From 6ec43b1838bd71633ac3f853c63ddf1f5940b1ed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 25 Jun 2014 20:40:45 +0200 Subject: GFS2: replace count*size kzalloc by kcalloc kcalloc manages count*sizeof overflow. Cc: cluster-devel@redhat.com Signed-off-by: Fabian Frederick Signed-off-by: Steven Whitehouse --- fs/gfs2/lock_dlm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 91f274de1246..4fafea1c9ecf 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1036,8 +1036,8 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, new_size = old_size + RECOVER_SIZE_INC; - submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); - result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); if (!submit || !result) { kfree(submit); kfree(result); -- cgit v1.2.2 From 5bef3e7cf18c56cc733777c61b6b61a0b8a62b35 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 26 Jun 2014 10:46:25 -0400 Subject: GFS2: Allow flocks to use normal glock dq rather than dq_wait This patch allows flock glocks to use a non-blocking dequeue rather than dq_wait. It also reverts the previous patch I had posted regarding dq_wait. The reverted patch isn't necessarily a bad idea, but I decided this might avoid unforeseen side effects, and was therefore safer. 
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4fc3a3046174..491e8e023598 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -991,7 +991,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) goto out; flock_lock_file_wait(file, &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b703dcc91588..ee4e04fe60fc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1128,9 +1128,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); - if (!find_first_holder(gl)) - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); } /** -- cgit v1.2.2 From 97a4f1d7653684fff0d50e9328917506f06e9d79 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 26 Jun 2014 10:47:48 -0400 Subject: GFS2: Allow caching of glocks for flock This patch removes the GLF_NOCACHE flag from the glocks associated with flocks. There should be no good reason not to cache glocks for flocks: they only force the glock to be demoted before they can be reacquired, which can slow down performance and even cause glock hangs, especially in cases where the flocks are held in Shared (SH) mode. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 491e8e023598..26b3f952e6b1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -981,7 +981,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); -- cgit v1.2.2 From 6b49d1d9c3c1088758c6a2758aaa5d236ef609e2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 29 Jun 2014 12:21:39 +0200 Subject: GFS2: memcontrol: Spelling s/invlidate/invalidate/ Signed-off-by: Geert Uytterhoeven Cc: cluster-devel@redhat.com Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fc1100781bbc..2ffc67dce87f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -234,8 +234,8 @@ static void inode_go_sync(struct gfs2_glock *gl) * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: - * - * Normally we invlidate everything, but if we are moving into + * + * Normally we invalidate everything, but if we are moving into * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we * can keep hold of the metadata, since it won't have changed. 
* -- cgit v1.2.2 From 27ff6a0f7f5bf500e9d2a8760c062789b52c551f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 2 Jul 2014 22:05:27 +0200 Subject: GFS2: fs/gfs2/rgrp.c: kernel-doc warning fixes Cc: cluster-devel@redhat.com Signed-off-by: Fabian Frederick Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index db629d1bd1bd..f4cb9c0d6bbd 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -337,7 +337,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le /** * gfs2_free_extlen - Return extent length of free blocks - * @rbm: Starting position + * @rrbm: Starting position * @len: Max length to check * * Starting at the block specified by the rbm, see how many free blocks @@ -2522,7 +2522,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) /** * gfs2_rlist_free - free a resource group list - * @list: the list of resource groups + * @rlist: the list of resource groups * */ -- cgit v1.2.2 From 98ce2deda23a303682a4253f3016a1436f4b2735 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 17 Jul 2014 16:08:36 +0800 Subject: Btrfs: fix abnormal long waiting in fsync xfstests generic/127 detected this problem. Since commit 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8, fsync will only flush data within the passed range. This is the cause of the problem: btrfs's fsync has a stage called 'sync log' which waits for all the ordered extents it has recorded to finish. In xfstests/generic/127, with mixed operations such as truncate, fallocate, punch hole, and mapwrite, we get some pre-allocated extents, and mapwrite will mmap, and then msync. I found that msync waits for quite a long time (about 20s in my case). Thanks to ftrace, it turns out that the previous fallocate calls 'btrfs_wait_ordered_range()' to flush dirty pages, but as the range of dirty pages may be larger than 'btrfs_wait_ordered_range()' wants, some ordered extents are created without their corresponding pages getting flushed. They are then left in memory until we fsync, which runs into the 'sync log' stage and just waits for the system writeback thread to flush those pages and finish the ordered extents, so the latency is inevitable. This adds a flush similar to btrfs_start_ordered_extent() in btrfs_wait_logged_extents() to fix that.
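The fix has the standard flush-then-wait shape for a mapping range; a minimal sketch, with start/end bounding the ordered extent's range as in the diff below:

	/* Kick writeback for the pages backing this ordered extent so
	 * the wait below does not stall behind the periodic writeback
	 * thread. */
	filemap_fdatawrite_range(inode->i_mapping, start, end);
	wait_event(ordered->wait,
		   test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags));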
Reviewed-by: Miao Xie Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e12441c7cf1d..7187b14faa6c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -484,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) log_list); list_del_init(&ordered->log_list); spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } -- cgit v1.2.2 From 0bfaa9c5cb479cebc24979b384374fe47500b4c9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 7 Jul 2014 12:34:49 -0500 Subject: btrfs: test for valid bdev before kobj removal in btrfs_rm_device commit 99994cd btrfs: dev delete should remove sysfs entry added a btrfs_kobj_rm_device, which dereferences device->bdev... right after we check whether device->bdev might be NULL. I don't honestly know if it's possible to have a NULL device->bdev here, but assuming that it is (given the test), we need to move the kobject removal to be under that test. (Coverity spotted this) Signed-off-by: Eric Sandeen Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6104676857f5..6cb82f62cb7c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1680,11 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev == root->fs_info->fs_devices->latest_bdev) root->fs_info->fs_devices->latest_bdev = next_device->bdev; - if (device->bdev) + if (device->bdev) { device->fs_devices->open_devices--; - - /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + } call_rcu(&device->rcu, free_device); -- cgit v1.2.2