From 4184ea7f908d95f329febc3665cf66da8568b467 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 12:39:20 -0400 Subject: Btrfs: Fix locking around adding new space_info Storage allocated to different raid levels in btrfs is tracked by a btrfs_space_info structure, and all of the current space_infos are collected into a list_head. Most filesystems have 3 or 4 of these structs total, and the list is only changed when new raid levels are added or at unmount time. This commit adds rcu locking on the list head, and properly frees things at unmount time. It also clears the space_info->full flag whenever new space is added to the FS. The locking for the space info list goes like this: reads: protected by rcu_read_lock() writes: protected by the chunk_mutex At unmount time we don't need special locking because all the readers are gone. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/volumes.c | 2 ++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82491ba8fa40..5e1d4e30e9d8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -784,7 +784,14 @@ struct btrfs_fs_info { struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ struct list_head space_info; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; @@ -1797,6 +1804,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + int btrfs_check_metadata_free_space(struct btrfs_root *root); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9abf81f71c46..fefe83ad2059 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "crc32c.h" @@ -330,13 +331,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, { struct list_head *head = &info->space_info; struct btrfs_space_info *found; - list_for_each_entry(found, head, list) { - if (found->flags == flags) + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags == flags) { + rcu_read_unlock(); return found; + } } + rcu_read_unlock(); return NULL; } +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -1903,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - list_add(&found->list, &info->space_info); INIT_LIST_HEAD(&found->block_groups); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); @@ -1917,6 +1937,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = 0; *space_info = found; + list_add_rcu(&found->list, &info->space_info); return 0; } @@ -6320,6 +6341,7 @@ out: int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; struct rb_node *n; spin_lock(&info->block_group_cache_lock); @@ -6341,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) spin_lock(&info->block_group_cache_lock); } spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + list_del(&space_info->list); + kfree(space_info); + } return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1316139bf9e8..7aa3810d7f69 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1459,6 +1459,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + return btrfs_update_device(trans, device); } -- cgit v1.2.2 From 913d952eb573c3d1f7487e83b5590e13e7cae2bd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 13:17:18 -0400 Subject: Btrfs: Clear space_info full when adding new devices The full flag on the space info structs tells the allocator not to try and allocate more chunks because the devices in the FS are fully allocated. When more devices are added, we need to clear the full flag so the allocator knows it has more space available. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7aa3810d7f69..dd06e18e5aac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1374,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_add_device(trans, root, device); } + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); btrfs_commit_transaction(trans, root); -- cgit v1.2.2 From 395a87bfefbc400011417e9eaae33169f9f036c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 10 Mar 2009 18:18:47 -0400 Subject: ext4: fix header check in ext4_ext_search_right() for deep extent trees. The ext4_ext_search_right() function is confusing; it uses a "depth" variable which is 0 at the root and maximum at the leaves, but the on-disk metadata uses a "depth" (actually eh_depth) which is opposite: maximum at the root, and 0 at the leaves. The ext4_ext_check_header() function is given a depth and checks the header agaisnt that depth; it expects the on-disk semantics, but we are giving it the opposite in the while loop in this function. We should be giving it the on-disk notion of "depth" which we can get from (p_depth - depth) - and if you look, the last (more commonly hit) call to ext4_ext_check_header() does just this. Sending in the wrong depth results in (incorrect) messages about corruption: EXT4-fs error (device sdb1): ext4_ext_search_right: bad header in inode #2621457: unexpected eh_depth - magic f30a, entries 340, max 340(0), depth 1(2) http://bugzilla.kernel.org/show_bug.cgi?id=12821 Reported-by: David Dindorp Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2eab196875f..e0aa4fe4f596 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; BUG_ON(path == NULL); depth = path->p_depth; @@ -1179,7 +1180,8 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } -- cgit v1.2.2 From 2842c3b5449f31470b61db716f1926b594fb6156 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 12 Mar 2009 12:20:01 -0400 Subject: ext4: Print the find_group_flex() warning only once This is a short-term warning, and even printk_ratelimit() can result in too much noise in system logs. So only print it once as a warning. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 627f8c3337a3..2d2b3585ee91 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -698,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -719,7 +720,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group); - if (ret2 == 0 && printk_ratelimit()) + if (ret2 == 0 && once) + once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); -- cgit v1.2.2 From 8d03c7a0c550e7ab24cadcef5e66656bfadec8b9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 14 Mar 2009 11:51:46 -0400 Subject: ext4: fix bogus BUG_ONs in in mballoc code Thiemo Nagel reported that: # dd if=/dev/zero of=image.ext4 bs=1M count=2 # mkfs.ext4 -v -F -b 1024 -m 0 -g 512 -G 4 -I 128 -N 1 \ -O large_file,dir_index,flex_bg,extent,sparse_super image.ext4 # mount -o loop image.ext4 mnt/ # dd if=/dev/zero of=mnt/file oopsed, with a BUG_ON in ext4_mb_normalize_request because size == EXT4_BLOCKS_PER_GROUP It appears to me (esp. after talking to Andreas) that the BUG_ON is bogus; a request of exactly EXT4_BLOCKS_PER_GROUP should be allowed, though larger sizes do indicate a problem. Fix that an another (apparently rare) codepath with a similar check. Reported-by: Thiemo Nagel Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4415beeb0b62..41f4348b62f5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); @@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ -- cgit v1.2.2 From ee6f779b9e0851e2f7da292a9f58e0095edf615a Mon Sep 17 00:00:00 2001 From: Zhang Le Date: Mon, 16 Mar 2009 14:44:31 +0800 Subject: filp->f_pos not correctly updated in proc_task_readdir filp->f_pos only get updated at the end of the function. Thus d_off of those dirents who are in the middle will be 0, and this will cause a problem in glibc's readdir implementation, specifically endless loop. Because when overflow occurs, f_pos will be set to next dirent to read, however it will be 0, unless the next one is the last one. So it will start over again and again. There is a sample program in man 2 gendents. This is the output of the program running on a multithread program's task dir before this patch is applied: $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 0 .. 506443 directory 16 0 3807 506444 directory 16 0 3809 506445 directory 16 0 3812 506446 directory 16 0 3861 506447 directory 16 0 3862 506448 directory 16 8 3863 This is the output after this patch is applied $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 2 .. 506443 directory 16 3 3807 506444 directory 16 4 3809 506445 directory 16 5 3812 506446 directory 16 6 3861 506447 directory 16 7 3862 506448 directory 16 8 3863 Signed-off-by: Zhang Le Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0c9de19a1633..cc6ea2329e71 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3066,7 +3066,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi int retval = -ENOENT; ino_t ino; int tid; - unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ struct pid_namespace *ns; task = get_proc_task(inode); @@ -3083,18 +3082,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (pos) { + switch (filp->f_pos) { case 0: ino = inode->i_ino; - if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ case 1: ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ } @@ -3104,9 +3103,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi ns = filp->f_dentry->d_sb->s_fs_info; tid = (int)filp->f_version; filp->f_version = 0; - for (task = first_tid(leader, tid, pos - 2, ns); + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); task; - task = next_tid(task), pos++) { + task = next_tid(task), filp->f_pos++) { tid = task_pid_nr_ns(task, ns); if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first @@ -3117,7 +3116,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi } } out: - filp->f_pos = pos; put_task_struct(leader); out_no_task: return retval; -- cgit v1.2.2 From d33a1976fbee1ee321d6f014333d8f03a39d526c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 16 Mar 2009 23:25:40 -0400 Subject: ext4: fix bb_prealloc_list corruption due to wrong group locking This is for Red Hat bug 490026: EXT4 panic, list corruption in ext4_mb_new_inode_pa ext4_lock_group(sb, group) is supposed to protect this list for each group, and a common code flow to remove an album is like this: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &grp, NULL); ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); so it's critical that we get the right group number back for this prealloc context, to lock the right group (the one associated with this pa) and prevent concurrent list manipulation. however, ext4_mb_put_pa() passes in (pa->pa_pstart - 1) with a comment, "-1 is to protect from crossing allocation group". This makes sense for the group_pa, where pa_pstart is advanced by the length which has been used (in ext4_mb_release_context()), and when the entire length has been used, pa_pstart has been advanced to the first block of the next group. However, for inode_pa, pa_pstart is never advanced; it's just set once to the first block in the group and not moved after that. So in this case, if we subtract one in ext4_mb_put_pa(), we are actually locking the *previous* group, and opening the race with the other threads which do not subtract off the extra block. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41f4348b62f5..9f61e62f435f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; + ext4_fsblk_t grp_blk; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); /* * possible race: -- cgit v1.2.2 From ee568b25ee9e160b32d1aef73d8b2ee9c05d34db Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Mar 2009 10:02:35 -0700 Subject: Avoid 64-bit "switch()" statements on 32-bit architectures Commit ee6f779b9e0851e2f7da292a9f58e0095edf615a ("filp->f_pos not correctly updated in proc_task_readdir") changed the proc code to use filp->f_pos directly, rather than through a temporary variable. In the process, that caused the operations to be done on the full 64 bits, even though the offset is never that big. That's all fine and dandy per se, but for some unfathomable reason gcc generates absolutely horrid code when using 64-bit values in switch() statements. To the point of actually calling out to gcc helper functions like __cmpdi2 rather than just doing the trivial comparisons directly the way gcc does for normal compares. At which point we get link failures, because we really don't want to support that kind of crazy code. Fix this by just casting the f_pos value to "unsigned long", which is plenty big enough for /proc, and avoids the gcc code generation issue. Reported-by: Alexey Dobriyan Cc: Zhang Le Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index cc6ea2329e71..beaa0ce3b82e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3082,7 +3082,7 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (filp->f_pos) { + switch ((unsigned long)filp->f_pos) { case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) -- cgit v1.2.2 From 84f09f46b4ee9e4e9b6381f8af31817516d2091b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 4 Mar 2009 23:05:35 +0200 Subject: NFSD: provide encode routine for OP_OPENATTR Although this operation is unsupported by our implementation we still need to provide an encode routine for it to merely encode its (error) status back in the compound reply. Thanks for Bill Baker at sun.com for testing with the Sun OpenSolaris' client, finding, and reporting this bug at Connectathon 2009. This bug was introduced in 2.6.27 Signed-off-by: Benny Halevy Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f65953be39c0..9250067943d8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, -- cgit v1.2.2 From a8e7d49aa7be728c4ae241a75a2a124cdcabc0c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 Mar 2009 11:32:05 -0700 Subject: Fix race in create_empty_buffers() vs __set_page_dirty_buffers() Nick Piggin noticed this (very unlikely) race between setting a page dirty and creating the buffers for it - we need to hold the mapping private_lock until we've set the page dirty bit in order to make sure that create_empty_buffers() might not build up a set of buffers without the dirty bits set when the page is dirty. I doubt anybody has ever hit this race (and it didn't solve the issue Nick was looking at), but as Nick says: "Still, it does appear to solve a real race, which we should close." Acked-by: Nick Piggin Signed-off-by: Linus Torvalds --- fs/buffer.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9f697419ed8e..891e1c78e4f1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -760,15 +760,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. */ -static int __set_page_dirty(struct page *page, +static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - if (TestSetPageDirty(page)) - return 0; - spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); @@ -785,8 +779,6 @@ static int __set_page_dirty(struct page *page, } spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - - return 1; } /* @@ -816,6 +808,7 @@ static int __set_page_dirty(struct page *page, */ int __set_page_dirty_buffers(struct page *page) { + int newly_dirty; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -831,9 +824,12 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - return __set_page_dirty(page, mapping, 1); + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1262,8 +1258,11 @@ void mark_buffer_dirty(struct buffer_head *bh) return; } - if (!test_set_buffer_dirty(bh)) - __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) + __set_page_dirty(page, page_mapping(page), 0); + } } /* -- cgit v1.2.2 From 87c3a86e1c220121d0ced59d1a71e78ed9abc6dd Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Mar 2009 17:04:19 -0700 Subject: eventfd: remove fput() call from possible IRQ context Remove a source of fput() call from inside IRQ context. Myself, like Eric, wasn't able to reproduce an fput() call from IRQ context, but Jeff said he was able to, with the attached test program. Independently from this, the bug is conceptually there, so we might be better off fixing it. This patch adds an optimization similar to the one we already do on ->ki_filp, on ->ki_eventfd. Playing with ->f_count directly is not pretty in general, but the alternative here would be to add a brand new delayed fput() infrastructure, that I'm not sure is worth it. Signed-off-by: Davide Libenzi Cc: Benjamin LaHaise Cc: Trond Myklebust Cc: Eric Dumazet Signed-off-by: Jeff Moyer Cc: Zach Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 8fa77e233944..4a9d4d641fb9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) req->private = NULL; req->ki_iovec = NULL; INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = ERR_PTR(-EINVAL); + req->ki_eventfd = NULL; /* Check if the completion queue has enough free space to * accept an event from this io. @@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) { assert_spin_locked(&ctx->ctx_lock); - if (!IS_ERR(req->ki_eventfd)) - fput(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); if (req->ki_iovec != &req->ki_inline_vec) @@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data) list_del(&req->ki_list); spin_unlock_irq(&fput_lock); - /* Complete the fput */ - __fput(req->ki_filp); + /* Complete the fput(s) */ + if (req->ki_filp != NULL) + __fput(req->ki_filp); + if (req->ki_eventfd != NULL) + __fput(req->ki_eventfd); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { + int schedule_putreq = 0; + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); - req->ki_users --; + req->ki_users--; BUG_ON(req->ki_users < 0); if (likely(req->ki_users)) return 0; @@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - /* Must be done under the lock to serialise against cancellation. - * Call this aio_fput as it duplicates fput via the fput_work. + /* + * Try to optimize the aio and eventfd file* puts, by avoiding to + * schedule work in case it is not __fput() time. In normal cases, + * we would not be holding the last reference to the file*, so + * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) + schedule_putreq++; + else + req->ki_filp = NULL; + if (req->ki_eventfd != NULL) { + if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count))) + schedule_putreq++; + else + req->ki_eventfd = NULL; + } + if (unlikely(schedule_putreq)) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -1009,7 +1025,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (!IS_ERR(iocb->ki_eventfd)) + if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); put_rq: @@ -1608,6 +1624,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); + req->ki_eventfd = NULL; goto out_put_req; } } -- cgit v1.2.2 From 65c24491b4fef017c64e39ec64384fde5e05e0a0 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 18 Mar 2009 17:04:21 -0700 Subject: aio: lookup_ioctx can return the wrong value when looking up a bogus context The libaio test harness turned up a problem whereby lookup_ioctx on a bogus io context was returning the 1 valid io context from the list (harness/cases/3.p). Because of that, an extra put_iocontext was done, and when the process exited, it hit a BUG_ON in the put_iocontext macro called from exit_aio (since we expect a users count of 1 and instead get 0). The problem was introduced by "aio: make the lookup_ioctx() lockless" (commit abf137dd7712132ee56d5b3143c2ff61a72a5faa). Thanks to Zach for pointing out that hlist_for_each_entry_rcu will not return with a NULL tpos at the end of the loop, even if the entry was not found. Signed-off-by: Jeff Moyer Acked-by: Zach Brown Acked-by: Jens Axboe Cc: Benjamin LaHaise Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 4a9d4d641fb9..76da12537956 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -587,7 +587,7 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; - struct kioctx *ctx = NULL; + struct kioctx *ctx, *ret = NULL; struct hlist_node *n; rcu_read_lock(); @@ -595,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { if (ctx->user_id == ctx_id && !ctx->dead) { get_ioctx(ctx); + ret = ctx; break; } } rcu_read_unlock(); - return ctx; + return ret; } /* -- cgit v1.2.2 From 8faece5f906725c10e7a1f6caf84452abadbdc7b Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 01:25:09 -0500 Subject: eCryptfs: Allocate a variable number of pages for file headers When allocating the memory used to store the eCryptfs header contents, a single, zeroed page was being allocated with get_zeroed_page(). However, the size of an eCryptfs header is either PAGE_CACHE_SIZE or ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE (8192), whichever is larger, and is stored in the file's private_data->crypt_stat->num_header_bytes_at_front field. ecryptfs_write_metadata_to_contents() was using num_header_bytes_at_front to decide how many bytes should be written to the lower filesystem for the file header. Unfortunately, at least 8K was being written from the page, despite the chance of the single, zeroed page being smaller than 8K. This resulted in random areas of kernel memory being written between the 0x1000 and 0x1FFF bytes offsets in the eCryptfs file headers if PAGE_SIZE was 4K. This patch allocates a variable number of pages, calculated with num_header_bytes_at_front, and passes the number of allocated pages along to ecryptfs_write_metadata_to_contents(). Thanks to Florian Streibelt for reporting the data leak and working with me to find the problem. 2.6.28 is the only kernel release with this vulnerability. Corresponds to CVE-2009-0787 Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Reviewed-by: Eric Sandeen Reviewed-by: Eugene Teo Cc: Greg KH Cc: dann frazier Cc: Serge E. Hallyn Cc: Florian Streibelt Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index bdca1f4b3a3e..75bee99de0f6 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1324,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry, - char *virt) +ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, + char *virt, size_t virt_len) { int rc; rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, - 0, crypt_stat->num_header_bytes_at_front); + 0, virt_len); if (rc) printk(KERN_ERR "%s: Error attempting to write header " "information to lower file; rc = [%d]\n", __func__, @@ -1341,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, - struct ecryptfs_crypt_stat *crypt_stat, char *page_virt, size_t size) { int rc; @@ -1351,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, return rc; } +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + /** * ecryptfs_write_metadata * @ecryptfs_dentry: The eCryptfs dentry @@ -1367,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + unsigned int order; char *virt; + size_t virt_len; size_t size = 0; int rc = 0; @@ -1383,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -EINVAL; goto out; } + virt_len = crypt_stat->num_header_bytes_at_front; + order = get_order(virt_len); /* Released in this function */ - virt = (char *)get_zeroed_page(GFP_KERNEL); + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); if (!virt) { printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } - rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, - crypt_stat, ecryptfs_dentry); + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", __func__, rc); goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, - crypt_stat, virt, size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); else - rc = ecryptfs_write_metadata_to_contents(crypt_stat, - ecryptfs_dentry, virt); + rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " "rc = [%d]\n", __func__, rc); goto out_free; } out_free: - free_page((unsigned long)virt); + free_pages((unsigned long)virt, order); out: return rc; } -- cgit v1.2.2 From 2aac0cf88681bfa092f731553bc7fbd23516be73 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 02:23:57 -0500 Subject: eCryptfs: NULL crypt_stat dereference during lookup If ecryptfs_encrypted_view or ecryptfs_xattr_metadata were being specified as mount options, a NULL pointer dereference of crypt_stat was possible during lookup. This patch moves the crypt_stat assignment into ecryptfs_lookup_and_interpose_lower(), ensuring that crypt_stat will not be NULL before we attempt to dereference it. Thanks to Dan Carpenter and his static analysis tool, smatch, for finding this bug. Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Cc: Dan Carpenter Cc: Serge Hallyn Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 10 ++++++---- fs/ecryptfs/ecryptfs_kernel.h | 1 - fs/ecryptfs/inode.c | 32 ++++++++++++-------------------- 3 files changed, 18 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 75bee99de0f6..8b65f289ee00 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2221,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, struct dentry *ecryptfs_dir_dentry, const char *name, size_t name_size) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; char *decoded_name; size_t decoded_name_size; size_t packet_size; int rc = 0; - if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private( - ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; const char *orig_name = name; size_t orig_name_size = name_size; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index eb2267eca1fe..ac749d4d644f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -620,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry, u32 flags); int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5697899a168d..55b3145b8072 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -246,7 +246,6 @@ out: */ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd) { @@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct vfsmount *lower_mnt; struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; char *page_virt = NULL; u64 file_size; int rc = 0; @@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out_free_kmem; } } + crypt_stat = &ecryptfs_inode_to_private( + ecryptfs_dentry->d_inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); rc = ecryptfs_read_and_validate_header_region(page_virt, ecryptfs_dentry->d_inode); if (rc) { @@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, { char *encrypted_and_encoded_name = NULL; size_t encrypted_and_encoded_name_size; - struct ecryptfs_crypt_stat *crypt_stat = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - struct ecryptfs_inode_info *inode_info; struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; @@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } if (lower_dentry->d_inode) goto lookup_and_interpose; - inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - if (inode_info) { - crypt_stat = &inode_info->crypt_stat; - /* TODO: lock for crypt_stat comparison */ - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - } - if (crypt_stat) - mount_crypt_stat = crypt_stat->mount_crypt_stat; - else - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) - && !(mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) goto lookup_and_interpose; dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, ecryptfs_dentry->d_name.len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " @@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } lookup_and_interpose: rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, - crypt_stat, ecryptfs_dir_inode, + ecryptfs_dir_inode, ecryptfs_nd); goto out; out_d_drop: -- cgit v1.2.2 From f762dd68218665bb87d4e4a0eeac86fde7530293 Mon Sep 17 00:00:00 2001 From: Gertjan van Wingerde Date: Sat, 21 Mar 2009 23:18:57 +0100 Subject: Update my email address Update all previous incarnations of my email address to the correct one. Signed-off-by: Gertjan van Wingerde Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 2 +- fs/ufs/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index d1d1eb84679d..618865b3128b 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -3,7 +3,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Copyright (C) 1996 Gertjan van Wingerde (gertjan@cs.vu.nl) + * Copyright (C) 1996 Gertjan van Wingerde * Minix V2 fs support. * * Modified for 680x0 by Andreas Schwab diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e65212dfb60e..261a1c2f22dd 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -41,7 +41,7 @@ * Stefan Reinauer * * Module usage counts added on 96/04/29 by - * Gertjan van Wingerde + * Gertjan van Wingerde * * Clean swab support on 19970406 by * Francois-Rene Rideau -- cgit v1.2.2