From 7eaebe7d503c3ef240ac7b3efc5433fe647c0298 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 21 Jan 2009 10:49:16 -0500 Subject: Btrfs: removed unused #include 's Removed unused #include 's in btrfs Signed-off-by: Huang Weiyi Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8adfe059ab41..44dbd550c4bd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.2 From c6e308713a47527f88a277ee95b7c5d1db80f77b Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Wed, 21 Jan 2009 10:59:08 -0500 Subject: Btrfs: simplify iteration codes Merge list_for_each* and list_entry to list_for_each_entry* Signed-off-by: Qinghuang Feng Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 44dbd550c4bd..45cf03ee1bc2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1323,12 +1323,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_offset, struct list_head *list) { - struct list_head *cur; struct btrfs_ordered_sum *sum; btrfs_set_trans_block_group(trans, inode); - list_for_each(cur, list) { - sum = list_entry(cur, struct btrfs_ordered_sum, list); + + list_for_each_entry(sum, list, list) { btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); } -- cgit v1.2.2 From 35054394c4b3cecd52577c2662c84da1f3e73525 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 21 Jan 2009 13:11:13 -0500 Subject: Btrfs: stop providing a bmap operation to avoid swapfile corruptions Swapfiles use bmap to build a list of extents belonging to the file, and they assume these extents won't change over the life of the file. They also use resulting list to do IO directly to the block device. This causes problems for btrfs in a few ways: btrfs returns logical block numbers through bmap, and these are not suitable for IO. They might translate to different devices, raid etc. COW means that file block mappings are going to change frequently. Using swapfiles on btrfs will lead to corruption, so we're avoiding the problem for now by dropping bmap support entirely. A later commit will add fiemap support for people that really want to know how a file is laid out. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 45cf03ee1bc2..2e25d698bab0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4156,11 +4156,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return -EINVAL; } -static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) -{ - return extent_bmap(mapping, iblock, btrfs_get_extent); -} - int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; @@ -4985,13 +4980,24 @@ static struct extent_io_ops btrfs_extent_io_ops = { .clear_bit_hook = btrfs_clear_bit_hook, }; +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. + */ static struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, .sync_page = block_sync_page, - .bmap = btrfs_bmap, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, -- cgit v1.2.2 From 1506fcc8189cdd4b95e06df7845a09f18b4526a6 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 21 Jan 2009 14:39:14 -0500 Subject: Btrfs: fiemap support Now that bmap support is gone, this is the only way to get extent mappings for userland. These are still not valid for IO, but they can tell us if a file has holes or how much fragmentation there is. Signed-off-by: Yehuda Sadeh --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2e25d698bab0..288c2cdc7543 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4156,6 +4156,12 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return -EINVAL; } +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); +} + int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; @@ -5021,6 +5027,7 @@ static struct inode_operations btrfs_file_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fallocate = btrfs_fallocate, + .fiemap = btrfs_fiemap, }; static struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, -- cgit v1.2.2 From 89f135d8b53bcccafd91a075366d2704ba257cf3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 28 Jan 2009 15:34:27 -0500 Subject: Btrfs: fix readdir on 32 bit machines After btrfs_readdir has gone through all the directory items, it sets the directory f_pos to the largest possible int. This way applications that mix readdir with creating new files don't end up in an endless loop finding the new directory items as they go. It was a workaround for a bug in git, but the assumption was that if git could make this looping mistake than it would be a common problem. The largest possible int chosen was INT_LIMIT(typeof(file->f_pos), and it is possible for that to be a larger number than 32 bit glibc expects to come out of readdir. This patches switches that to INT_LIMIT(off_t), which should keep applications happy on 32 and 64 bit machines. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 288c2cdc7543..2bb65e9b1448 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3263,7 +3263,7 @@ skip: /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) - filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + filp->f_pos = INT_LIMIT(off_t); else filp->f_pos++; nopos: -- cgit v1.2.2 From 0279b4cd86685b5eea467c1b74ce94f0add2c0a3 Mon Sep 17 00:00:00 2001 From: Jim Owens Date: Wed, 4 Feb 2009 09:29:13 -0500 Subject: Btrfs: selinux support Add call to LSM security initialization and save resulting security xattr for new inodes. Add xattr support to symlink inode ops. Set inode->i_op for existing special files. Signed-off-by: jim owens --- fs/btrfs/inode.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bb65e9b1448..4a79e1c5ebd0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -90,6 +90,16 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) +{ + int err; + + err = btrfs_init_acl(inode, dir); + if (!err) + err = btrfs_xattr_security_init(inode, dir); + return err; +} + /* * a very lame attempt at stopping writes when the FS is 85% full. There * are countless ways this is incorrect, but it is better than nothing. @@ -2037,6 +2047,7 @@ void btrfs_read_locked_inode(struct inode *inode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: + inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } @@ -3584,7 +3595,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -3647,7 +3658,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -3770,7 +3781,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) goto out_fail; @@ -4732,7 +4743,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -5043,4 +5054,8 @@ static struct inode_operations btrfs_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, }; -- cgit v1.2.2 From b4ce94de9b4d64e8ab3cf155d13653c666e22b9b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:25:08 -0500 Subject: Btrfs: Change btree locking to use explicit blocking points Most of the btrfs metadata operations can be protected by a spinlock, but some operations still need to schedule. So far, btrfs has been using a mutex along with a trylock loop, most of the time it is able to avoid going for the full mutex, so the trylock loop is a big performance gain. This commit is step one for getting rid of the blocking locks entirely. btrfs_tree_lock takes a spinlock, and the code explicitly switches to a blocking lock when it starts an operation that can schedule. We'll be able get rid of the blocking locks in smaller pieces over time. Tracing allows us to find the most common cause of blocking, so we can start with the hot spots first. The basic idea is: btrfs_tree_lock() returns with the spin lock held btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in the extent buffer flags, and then drops the spin lock. The buffer is still considered locked by all of the btrfs code. If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops the spin lock and waits on a wait queue for the blocking bit to go away. Much of the code that needs to set the blocking bit finishes without actually blocking a good percentage of the time. So, an adaptive spin is still used against the blocking bit to avoid very high context switch rates. btrfs_clear_lock_blocking() clears the blocking bit and returns with the spinlock held again. btrfs_tree_unlock() can be called on either blocking or spinning locks, it does the right thing based on the blocking bit. ctree.c has a helper function to set/clear all the locked buffers in a path as blocking. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a79e1c5ebd0..ebd7d6c37df8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "tree-log.h" #include "ref-cache.h" #include "compression.h" +#include "locking.h" struct btrfs_iget_args { u64 ino; @@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, goto failed; } + btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); -- cgit v1.2.2 From bd56b30205bc09da0beb80d4ba3d4c7309792da5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:27:02 -0500 Subject: Btrfs: Make btrfs_drop_snapshot work in larger and more efficient chunks Every transaction in btrfs creates a new snapshot, and then schedules the snapshot from the last transaction for deletion. Snapshot deletion works by walking down the btree and dropping the reference counts on each btree block during the walk. If if a given leaf or node has a reference count greater than one, the reference count is decremented and the subtree pointed to by that node is ignored. If the reference count is one, walking continues down into that node or leaf, and the references of everything it points to are decremented. The old code would try to work in small pieces, walking down the tree until it found the lowest leaf or node to free and then returning. This was very friendly to the rest of the FS because it didn't have a huge impact on other operations. But it wouldn't always keep up with the rate that new commits added new snapshots for deletion, and it wasn't very optimal for the extent allocation tree because it wasn't finding leaves that were close together on disk and processing them at the same time. This changes things to walk down to a level 1 node and then process it in bulk. All the leaf pointers are sorted and the leaves are dropped in order based on their extent number. The extent allocation tree and commit code are now fast enough for this kind of bulk processing to work without slowing the rest of the FS down. Overall it does less IO and is better able to keep up with snapshot deletions under high load. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ebd7d6c37df8..95ea58cb3065 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2441,6 +2441,8 @@ next_node: ref->generation = leaf_gen; ref->nritems = 0; + btrfs_sort_leaf_ref(ref); + ret = btrfs_add_leaf_ref(root, ref, 0); WARN_ON(ret); btrfs_free_leaf_ref(root, ref); -- cgit v1.2.2 From 8c087b5183adab186a298f2d6ed39aefdcae413c Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Wed, 4 Feb 2009 09:29:54 -0500 Subject: Btrfs: Handle SGID bit when creating inodes Before this patch, new files/dirs would ignore the SGID bit on their parent directory and always be owned by the creating user's uid/gid. Signed-off-by: Chris Ball Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95ea58cb3065..5792816d4fca 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3472,7 +3472,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, root->highest_inode = objectid; inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; inode->i_ino = objectid; inode_set_bytes(inode, 0); -- cgit v1.2.2 From f03d9301f15fb69cdf1eb59d53c9fb72f68ecccc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:31:06 -0500 Subject: Btrfs: Don't try to compress pages past i_size The compression code had some checks to make sure we were only compressing bytes inside of i_size, but it wasn't catching every case. To make things worse, some incorrect math about the number of bytes remaining would make it try to compress more pages than the file really had. The fix used here is to fall back to the non-compression code in this case, which does all the proper cleanup of delalloc and other accounting. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5792816d4fca..9b43a6f303b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -360,6 +360,19 @@ again: nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + total_compressed = actual_end - start; /* we want to make sure that amount of ram required to uncompress @@ -504,6 +517,7 @@ again: goto again; } } else { +cleanup_and_bail_uncompressed: /* * No compression, but we still need to write the pages in * the file we've been given so far. redirty the locked -- cgit v1.2.2 From 06d9a8d7c24fe22836bf0b0f82db59d6f98e271e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 4 Feb 2009 09:30:58 -0500 Subject: Btrfs: Change btrfs_truncate_inode_items to stop when it hits the inode btrfs_truncate_inode_items is setup to stop doing btree searches when it has finished removing the items for the inode. It used to detect the end of the inode by looking for an objectid that didn't match the one we were searching for. But, this would result in an extra search through the btree, which adds extra balancing and cow costs to the operation. This commit adds a check to see if we found the inode item, which means we can stop searching early. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b43a6f303b8..ddb0f0ecda6c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2504,7 +2504,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - u32 found_type; + u32 found_type = (u8)-1; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; u64 extent_start = 0; @@ -2691,6 +2691,8 @@ next: if (pending_del_nr) goto del_pending; btrfs_release_path(root, path); + if (found_type == BTRFS_INODE_ITEM_KEY) + break; goto search_again; } @@ -2707,6 +2709,8 @@ del_pending: BUG_ON(ret); pending_del_nr = 0; btrfs_release_path(root, path); + if (found_type == BTRFS_INODE_ITEM_KEY) + break; goto search_again; } } -- cgit v1.2.2 From 42f15d77df8a7e8a2feb15041d5d30710ee7f951 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Feb 2009 11:35:57 -0500 Subject: Btrfs: Make sure dir is non-null before doing S_ISGID checks The S_ISGID check in btrfs_new_inode caused an oops during subvol creation because sometimes the dir is null. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ddb0f0ecda6c..8f0706210a47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3491,7 +3491,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { + if (dir && (dir->i_mode & S_ISGID)) { inode->i_gid = dir->i_gid; if (S_ISDIR(mode)) mode |= S_ISGID; -- cgit v1.2.2