From f46b5a66b3316ef2f8febfe4c56e2d555e2c3979 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2008 21:53:53 -0400 Subject: Btrfs: split out ioctl.c Split the ioctl handling out of inode.c into a file of its own. Also fix up checkpatch.pl warnings for the moved code. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 781 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 781 insertions(+) create mode 100644 fs/btrfs/ioctl.c (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 000000000000..da8de6cfdb5a --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,781 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" + + + +static noinline int create_subvol(struct btrfs_root *root, char *name, + int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + unsigned long nr = 1; + + mutex_lock(&root->fs_info->fs_mutex); + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_commit; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = __btrfs_alloc_free_block(trans, root, root->leafsize, + objectid, trans->transid, 0, 0, + 0, 0); + if (IS_ERR(leaf)) + return PTR_ERR(leaf); + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nblocks = cpu_to_le64(1); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_level(&root_item, 0); + 
btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + /* + * insert the directory item + */ + key.offset = (u64)-1; + dir = root->fs_info->sb->s_root->d_inode; + ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR); + if (ret) + goto fail; + + ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root, + name, namelen, objectid, + root->fs_info->sb->s_root->d_inode->i_ino); + if (ret) + goto fail; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto fail_commit; + + new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen); + BUG_ON(!new_root); + + trans = btrfs_start_transaction(new_root, 1); + BUG_ON(!trans); + + ret = btrfs_create_subvol_root(new_root, trans, new_dirid, + BTRFS_I(dir)->block_group); + if (ret) + goto fail; + + /* Invalidate existing dcache entry for new subvolume. 
*/ + btrfs_invalidate_dcache_root(root, name, namelen); + +fail: + nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, new_root); + if (err && !ret) + ret = err; +fail_commit: + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root, nr); + btrfs_throttle(root); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, char *name, int namelen) +{ + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret; + int err; + unsigned long nr = 0; + + if (!root->ref_cows) + return -EINVAL; + + mutex_lock(&root->fs_info->fs_mutex); + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_unlock; + + pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + ret = btrfs_update_inode(trans, root, root->inode); + err = btrfs_commit_transaction(trans, root); + +fail_unlock: + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root, nr); + btrfs_throttle(root); + return ret; +} + +int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + mutex_lock(&root->fs_info->fs_mutex); + ret = btrfs_check_free_space(root, inode->i_size, 0); + 
mutex_unlock(&root->fs_info->fs_mutex); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + ClearPageDirty(page); +#else + cancel_dirty_page(page, PAGE_CACHE_SIZE); +#endif + wait_on_page_writeback(page); + set_page_extent_mapped(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_extent_delalloc(io_tree, page_start, + page_end, GFP_NOFS); + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +/* + * Called inside transaction, so use GFP_NOFS + */ + +static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + namelen = strlen(vol_args->name); + if (namelen > BTRFS_VOL_NAME_MAX) { + ret = -EINVAL; + goto out; + } + + 
mutex_lock(&root->fs_info->fs_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", devid); + } + device = btrfs_find_device(root, devid, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->fs_mutex); +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_dir_item *di; + struct btrfs_path *path; + u64 root_dirid; + int namelen; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if 
(copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + namelen = strlen(vol_args->name); + if (namelen > BTRFS_VOL_NAME_MAX) { + ret = -EINVAL; + goto out; + } + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, + mutex_lock(&root->fs_info->fs_mutex); + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, + path, root_dirid, + vol_args->name, namelen, 0); + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_free_path(path); + + if (di && !IS_ERR(di)) { + ret = -EEXIST; + goto out; + } + + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (root == root->fs_info->tree_root) + ret = create_subvol(root, vol_args->name, namelen); + else + ret = create_snapshot(root, vol_args->name, namelen); +out: + kfree(vol_args); + return ret; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + mutex_lock(&root->fs_info->fs_mutex); + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + mutex_unlock(&root->fs_info->fs_mutex); + break; + case S_IFREG: + btrfs_defrag_file(file); + break; + } + + return 0; +} + +long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if 
(!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +int dup_item_to_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf, + int slot, + struct btrfs_key *key, + u64 destino) +{ + char *dup; + int len = btrfs_item_size_nr(leaf, slot); + struct btrfs_key ckey = *key; + int ret = 0; + + dup = kmalloc(len, GFP_NOFS); + if (!dup) + return -ENOMEM; + + read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len); + btrfs_release_path(root, path); + + ckey.objectid = destino; + ret = btrfs_insert_item(trans, root, &ckey, dup, len); + kfree(dup); + return ret; +} + +long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + int ret; + u64 pos; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u32 nritems; + int slot; + + src_file = fget(src_fd); + if (!src_file) + return -EBADF; + src = src_file->f_dentry->d_inode; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + ret = -ENOTEMPTY; + if (inode->i_size) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + filemap_write_and_wait(src->i_mapping); + lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + if (BTRFS_I(src)->delalloc_bytes == 0) + break; + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + } + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + path = 
btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + key.offset = 0; + key.type = BTRFS_EXTENT_DATA_KEY; + key.objectid = src->i_ino; + pos = 0; + path->reada = 2; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. + */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + } + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + nritems = btrfs_header_nritems(leaf); + + if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int found_type; + pos = key.offset; + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + u64 len = btrfs_file_extent_num_bytes(leaf, + extent); + u64 ds = btrfs_file_extent_disk_bytenr(leaf, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + u64 off = btrfs_file_extent_offset(leaf, + extent); + btrfs_insert_file_extent(trans, root, + inode->i_ino, pos, + ds, dl, len, off); + /* ds == 0 means there's a hole */ + if (ds != 0) { + btrfs_inc_extent_ref(trans, root, + ds, dl, + root->root_key.objectid, + trans->transid, + inode->i_ino, pos); + } + pos = key.offset + len; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + ret = dup_item_to_inode(trans, root, path, + leaf, slot, &key, + inode->i_ino); + if (ret) + goto out; + pos = key.offset + btrfs_item_size_nr(leaf, + slot); + } + } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { + ret = dup_item_to_inode(trans, root, path, leaf, + slot, &key, inode->i_ino); + + if (ret) + goto out; + } + key.offset++; + btrfs_release_path(root, path); + } 
+ + ret = 0; +out: + btrfs_free_path(path); + + inode->i_blocks = src->i_blocks; + i_size_write(inode, src->i_size); + btrfs_update_inode(trans, root, inode); + + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); +out_fput: + fput(src_file); + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + mutex_lock(&root->fs_info->fs_mutex); + if (file->private_data) { + ret = -EINPROGRESS; + goto out; + } + trans = btrfs_start_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ +out: + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
+ */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + mutex_lock(&root->fs_info->fs_mutex); + trans = file->private_data; + if (!trans) { + ret = -EINVAL; + goto out; + } + btrfs_end_transaction(trans, root); + file->private_data = 0; +out: + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + + switch (cmd) { + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(root, (void __user *)arg); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, (void __user *)arg); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, (void __user *)arg); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, (void __user *)arg); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} -- cgit v1.2.2 From df5b5520b21d8a2554ede65c09b288833c77144d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2008 21:53:58 -0400 Subject: BTRFS_IOC_TRANS_START should be privileged As mentioned in the comment next to it btrfs_ioctl_trans_start can do bad damage to filesystems and thus should be limited to privileged users. 
Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index da8de6cfdb5a..6fb455802759 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -706,6 +706,9 @@ long btrfs_ioctl_trans_start(struct file *file) struct btrfs_trans_handle *trans; int ret = 0; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&root->fs_info->fs_mutex); if (file->private_data) { ret = -EINPROGRESS; -- cgit v1.2.2 From 925baeddc5b0764a53f2214a1253251bab0e0324 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jun 2008 16:01:30 -0400 Subject: Btrfs: Start btree concurrency work. The allocation trees and the chunk trees are serialized via their own dedicated mutexes. This means allocation location is still not very fine grained. The main FS btree is protected by locks on each block in the btree. Locks are taken top / down, and as processing finishes on a given level of the tree, the lock is released after locking the lower level. The end result of a search is now a path where only the lowest level is locked. Releasing or freeing the path drops any locks held. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6fb455802759..3fbf74e93dba 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include "ioctl.h" #include "print-tree.h" #include "volumes.h" +#include "locking.h" @@ -75,9 +76,9 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, if (ret) goto fail; - leaf = __btrfs_alloc_free_block(trans, root, root->leafsize, - objectid, trans->transid, 0, 0, - 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + objectid, trans->transid, 0, 0, + 0, 0); if (IS_ERR(leaf)) return PTR_ERR(leaf); @@ -108,6 +109,7 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); root_item.drop_level = 0; + btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; -- cgit v1.2.2 From a213501153fd66e2359e091b1612841305ba6551 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jun 2008 16:01:30 -0400 Subject: Btrfs: Replace the big fs_mutex with a collection of other locks Extent allocations are still protected by a large alloc_mutex. 
Objectid allocations are covered by a objectid mutex Other btree operations are protected by a lock on individual btree nodes Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3fbf74e93dba..6002eb64daf9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -63,7 +63,6 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; unsigned long nr = 1; - mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_check_free_space(root, 1, 0); if (ret) goto fail_commit; @@ -164,7 +163,6 @@ fail: if (err && !ret) ret = err; fail_commit: - mutex_unlock(&root->fs_info->fs_mutex); btrfs_btree_balance_dirty(root, nr); btrfs_throttle(root); return ret; @@ -181,7 +179,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) if (!root->ref_cows) return -EINVAL; - mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_check_free_space(root, 1, 0); if (ret) goto fail_unlock; @@ -208,7 +205,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) err = btrfs_commit_transaction(trans, root); fail_unlock: - mutex_unlock(&root->fs_info->fs_mutex); btrfs_btree_balance_dirty(root, nr); btrfs_throttle(root); return ret; @@ -228,9 +224,7 @@ int btrfs_defrag_file(struct file *file) unsigned long i; int ret; - mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_check_free_space(root, inode->i_size, 0); - mutex_unlock(&root->fs_info->fs_mutex); if (ret) return -ENOSPC; @@ -315,7 +309,8 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) goto out; } - mutex_lock(&root->fs_info->fs_mutex); + mutex_lock(&root->fs_info->alloc_mutex); + mutex_lock(&root->fs_info->chunk_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -385,7 +380,8 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, 
void __user *arg) } out_unlock: - mutex_unlock(&root->fs_info->fs_mutex); + mutex_lock(&root->fs_info->alloc_mutex); + mutex_lock(&root->fs_info->chunk_mutex); out: kfree(vol_args); return ret; @@ -428,11 +424,9 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, } root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, - mutex_lock(&root->fs_info->fs_mutex); di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path, root_dirid, vol_args->name, namelen, 0); - mutex_unlock(&root->fs_info->fs_mutex); btrfs_free_path(path); if (di && !IS_ERR(di)) { @@ -445,10 +439,12 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, goto out; } + mutex_lock(&root->fs_info->drop_mutex); if (root == root->fs_info->tree_root) ret = create_subvol(root, vol_args->name, namelen); else ret = create_snapshot(root, vol_args->name, namelen); + mutex_unlock(&root->fs_info->drop_mutex); out: kfree(vol_args); return ret; @@ -461,10 +457,8 @@ static int btrfs_ioctl_defrag(struct file *file) switch (inode->i_mode & S_IFMT) { case S_IFDIR: - mutex_lock(&root->fs_info->fs_mutex); btrfs_defrag_root(root, 0); btrfs_defrag_root(root->fs_info->extent_root, 0); - mutex_unlock(&root->fs_info->fs_mutex); break; case S_IFREG: btrfs_defrag_file(file); @@ -588,7 +582,6 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); } - mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 0); path = btrfs_alloc_path(); if (!path) { @@ -685,7 +678,6 @@ out: unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); out_unlock: mutex_unlock(&src->i_mutex); @@ -711,7 +703,6 @@ long btrfs_ioctl_trans_start(struct file *file) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->fs_mutex); if (file->private_data) { ret = -EINPROGRESS; goto out; @@ -723,7 +714,6 @@ long 
btrfs_ioctl_trans_start(struct file *file) ret = -ENOMEM; /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ out: - mutex_unlock(&root->fs_info->fs_mutex); return ret; } @@ -740,7 +730,6 @@ long btrfs_ioctl_trans_end(struct file *file) struct btrfs_trans_handle *trans; int ret = 0; - mutex_lock(&root->fs_info->fs_mutex); trans = file->private_data; if (!trans) { ret = -EINVAL; @@ -749,7 +738,6 @@ long btrfs_ioctl_trans_end(struct file *file) btrfs_end_transaction(trans, root); file->private_data = 0; out: - mutex_unlock(&root->fs_info->fs_mutex); return ret; } -- cgit v1.2.2 From 89ce8a63d0c761fbb02089850605360f389477d8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jun 2008 16:01:31 -0400 Subject: Add btrfs_end_transaction_throttle to force writers to wait for pending commits The existing throttle mechanism was often not sufficient to prevent new writers from coming in and making a given transaction run forever. This adds an explicit wait at the end of most operations so they will allow the current transaction to close. There is no wait inside file_write, inode updates, or cow filling, all which have different deadlock possibilities. This is a temporary measure until better asynchronous commit support is added. This code leads to stalls as it waits for data=ordered writeback, and it really needs to be fixed. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6002eb64daf9..026039a2ac58 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -164,7 +164,6 @@ fail: ret = err; fail_commit: btrfs_btree_balance_dirty(root, nr); - btrfs_throttle(root); return ret; } @@ -206,7 +205,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) fail_unlock: btrfs_btree_balance_dirty(root, nr); - btrfs_throttle(root); return ret; } -- cgit v1.2.2 From 7d9eb12c8739e7dc80c78c6b3596f912ecd8f941 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 8 Jul 2008 14:19:17 -0400 Subject: Btrfs: Add locking around volume management (device add/remove/balance) Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 026039a2ac58..83f17a5cbd6a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -307,8 +307,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) goto out; } - mutex_lock(&root->fs_info->alloc_mutex); - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -378,8 +377,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) } out_unlock: - mutex_lock(&root->fs_info->alloc_mutex); - mutex_lock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->volume_mutex); out: kfree(vol_args); return ret; -- cgit v1.2.2 From 3eaa2885276fd6dac7b076a793932428b7168e74 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 24 Jul 2008 11:57:52 -0400 Subject: Btrfs: Fix the defragmentation code and the block relocation code for data=ordered Before setting an extent to delalloc, the code needs to wait for pending ordered extents. 
Also, the relocation code needs to wait for ordered IO before scanning the block group again. This is because the extents are not removed until the IO for the new extents is finished Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 83f17a5cbd6a..a61f2e7e2db5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -213,6 +213,7 @@ int btrfs_defrag_file(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; struct page *page; unsigned long last_index; unsigned long ra_pages = root->fs_info->bdi.ra_pages; @@ -234,6 +235,7 @@ int btrfs_defrag_file(struct file *file) min(last_index, i + ra_pages - 1)); } total_read++; +again: page = grab_cache_page(inode->i_mapping, i); if (!page) goto out_unlock; @@ -247,18 +249,23 @@ int btrfs_defrag_file(struct file *file) } } -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - ClearPageDirty(page); -#else - cancel_dirty_page(page, PAGE_CACHE_SIZE); -#endif wait_on_page_writeback(page); - set_page_extent_mapped(page); page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + set_extent_delalloc(io_tree, page_start, page_end, GFP_NOFS); -- cgit v1.2.2 From aec7477b3b0e8ec93f6d274f25ba40b0665134d4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jul 2008 12:12:38 -0400 Subject: Btrfs: Implement new dir 
index format Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a61f2e7e2db5..faf081302d02 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -129,13 +129,13 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, dir = root->fs_info->sb->s_root->d_inode; ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, name, namelen, dir->i_ino, &key, - BTRFS_FT_DIR); + BTRFS_FT_DIR, 0); if (ret) goto fail; ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root, name, namelen, objectid, - root->fs_info->sb->s_root->d_inode->i_ino); + root->fs_info->sb->s_root->d_inode->i_ino, 0); if (ret) goto fail; -- cgit v1.2.2 From 8e8a1e31f2780b7865d40a8c5142a04f2bcdcb86 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jul 2008 12:17:14 -0400 Subject: Btrfs: Fix a few functions that exit without stopping their transaction Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index faf081302d02..7d40778a90e4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -78,8 +78,10 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, leaf = btrfs_alloc_free_block(trans, root, root->leafsize, objectid, trans->transid, 0, 0, 0, 0); - if (IS_ERR(leaf)) - return PTR_ERR(leaf); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } btrfs_set_header_nritems(leaf, 0); btrfs_set_header_level(leaf, 0); -- cgit v1.2.2 From 5516e5957f4b99b19fffffa53bf9fbe7cc793249 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 24 Jul 2008 12:20:14 -0400 Subject: Btrfs: Null terminate strings passed in from userspace The 'char name[BTRFS_PATH_NAME_MAX]' member of struct btrfs_ioctl_vol_args is passed directly to strlen() after being copied from user. 
I haven't verified this, but in theory a userspace program could pass in an unterminated string and cause a kernel crash as strlen walks off the end of the array. This patch terminates the ->name string in all btrfs ioctl functions which currently use a 'struct btrfs_ioctl_vol_args'. Since the string is now properly terminated, its length will never be longer than BTRFS_PATH_NAME_MAX, so that error check has been removed. By the way, it might be better overall to just have the ioctl pass an unterminated string + length structure but I didn't bother with that since it'd change the kernel/user interface. Signed-off-by: Mark Fasheh Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7d40778a90e4..5e627746c4e8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -310,11 +310,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) ret = -EFAULT; goto out; } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); - if (namelen > BTRFS_VOL_NAME_MAX) { - ret = -EINVAL; - goto out; - } mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; @@ -412,11 +410,8 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, goto out; } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); - if (namelen > BTRFS_VOL_NAME_MAX) { - ret = -EINVAL; - goto out; - } if (strchr(vol_args->name, '/')) { ret = -EINVAL; goto out; @@ -487,6 +482,7 @@ long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) ret = -EFAULT; goto out; } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); out: @@ -508,6 +504,7 @@ long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) ret = -EFAULT; goto out; } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); 
out: -- cgit v1.2.2 From f87f057b49ee52cf5c627ab27a706e3252767c9f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 1 Aug 2008 11:27:23 -0400 Subject: Btrfs: Improve and cleanup locking done by walk_down_tree While dropping snapshots, walk_down_tree does most of the work of checking reference counts and limiting tree traversal to just the blocks that we are freeing. It dropped and held the allocation mutex in strange and confusing ways, this commit changes it to only hold the mutex while actually freeing a block. The rest of the checks around reference counts should be safe without the lock because we only allow one process in btrfs_drop_snapshot at a time. Other processes dropping reference counts should not drop it to 1 because their tree roots already have an extra ref on the block. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5e627746c4e8..224da287b3ed 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -268,6 +268,12 @@ again: } set_page_extent_mapped(page); + /* + * this makes sure page_mkwrite is called on the + * page if it is dirtied again later + */ + clear_page_dirty_for_io(page); + set_extent_delalloc(io_tree, page_start, page_end, GFP_NOFS); -- cgit v1.2.2 From 9ca9ee09c176a814189063c8b88f75c8f8e4ad19 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 4 Aug 2008 10:41:27 -0400 Subject: Btrfs: fix ioctl-initiated transactions vs wait_current_trans() Commit 597:466b27332893 (btrfs_start_transaction: wait for commits in progress) breaks the transaction start/stop ioctls by making btrfs_start_transaction conditionally wait for the next transaction to start. If an application artificially is holding a transaction open, things deadlock. 
This workaround maintains a count of open ioctl-initiated transactions in fs_info, and avoids wait_current_trans() if any are currently open (in start_transaction() and btrfs_throttle()). The start transaction ioctl uses a new btrfs_start_ioctl_transaction() that _does_ call wait_current_trans(), effectively pushing the join/wait decision to the outer ioctl-initiated transaction. This more or less neuters btrfs_throttle() when ioctl-initiated transactions are in use, but that seems like a pretty fundamental consequence of wrapping lots of write()'s in a transaction. Btrfs has no way to tell if the application considers a given operation as part of its transaction. Obviously, if the transaction start/stop ioctls aren't being used, there is no effect on current behavior. Signed-off-by: Sage Weil --- ctree.h | 1 + ioctl.c | 12 +++++++++++- transaction.c | 18 +++++++++++++----- transaction.h | 2 ++ 4 files changed, 27 insertions(+), 6 deletions(-) Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 224da287b3ed..0b63c3c77cfd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -715,7 +715,12 @@ long btrfs_ioctl_trans_start(struct file *file) ret = -EINPROGRESS; goto out; } - trans = btrfs_start_transaction(root, 0); + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + trans = btrfs_start_ioctl_transaction(root, 0); if (trans) file->private_data = trans; else @@ -745,6 +750,11 @@ long btrfs_ioctl_trans_end(struct file *file) } btrfs_end_transaction(trans, root); file->private_data = 0; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + out: return ret; } -- cgit v1.2.2 From ea8c281947950fac5f78818b767821d696c9512a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 
4 Aug 2008 23:17:27 -0400 Subject: Btrfs: Maintain a list of inodes that are delalloc and a way to wait on them Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b63c3c77cfd..e1046a54b1c5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -274,8 +274,7 @@ again: */ clear_page_dirty_for_io(page); - set_extent_delalloc(io_tree, page_start, - page_end, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); unlock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_dirty(page); @@ -784,6 +783,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_TRANS_END: return btrfs_ioctl_trans_end(file); case BTRFS_IOC_SYNC: + btrfs_start_delalloc_inodes(root); btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; } -- cgit v1.2.2 From ae01a0abf343aefe923ace5c1a8c634adfbe29c5 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Mon, 4 Aug 2008 23:23:47 -0400 Subject: Btrfs: Update clone file ioctl This patch updates the file clone ioctl for the tree locking and new data ordered code. 
--- Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 150 ++++++++++++++++++++++++++----------------------------- 1 file changed, 70 insertions(+), 80 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e1046a54b1c5..3932c7cd0fae 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -517,32 +517,6 @@ out: return ret; } -int dup_item_to_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf, - int slot, - struct btrfs_key *key, - u64 destino) -{ - char *dup; - int len = btrfs_item_size_nr(leaf, slot); - struct btrfs_key ckey = *key; - int ret = 0; - - dup = kmalloc(len, GFP_NOFS); - if (!dup) - return -ENOMEM; - - read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len); - btrfs_release_path(root, path); - - ckey.objectid = destino; - ret = btrfs_insert_item(trans, root, &ckey, dup, len); - kfree(dup); - return ret; -} - long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) { struct inode *inode = fdentry(file)->d_inode; @@ -550,22 +524,41 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) struct file *src_file; struct inode *src; struct btrfs_trans_handle *trans; - int ret; - u64 pos; + struct btrfs_ordered_extent *ordered; struct btrfs_path *path; - struct btrfs_key key; struct extent_buffer *leaf; + char *buf; + struct btrfs_key key; + struct btrfs_key new_key; + u32 size; u32 nritems; int slot; + int ret; src_file = fget(src_fd); if (!src_file) return -EBADF; src = src_file->f_dentry->d_inode; + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + ret = -EXDEV; - if (src->i_sb != inode->i_sb) + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); goto out_fput; + } + path->reada = 2; if (inode < src) 
{ mutex_lock(&inode->i_mutex); @@ -582,24 +575,22 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) /* do any pending delalloc/csum calc on src, one way or another, and lock file content */ while (1) { - filemap_write_and_wait(src->i_mapping); lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); - if (BTRFS_I(src)->delalloc_bytes == 0) + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) break; unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, 0, (u64)-1); } - trans = btrfs_start_transaction(root, 0); - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - key.offset = 0; - key.type = BTRFS_EXTENT_DATA_KEY; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + key.objectid = src->i_ino; - pos = 0; - path->reada = 2; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; while (1) { /* @@ -610,18 +601,19 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) if (ret < 0) goto out; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; if (ret > 0) break; + nritems = btrfs_header_nritems(path->nodes[0]); } leaf = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - nritems = btrfs_header_nritems(leaf); + btrfs_item_key_to_cpu(leaf, &key, slot); if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || key.objectid != src->i_ino) break; @@ -629,66 +621,64 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *extent; int found_type; - pos = key.offset; + extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, extent); if 
(found_type == BTRFS_FILE_EXTENT_REG) { - u64 len = btrfs_file_extent_num_bytes(leaf, - extent); u64 ds = btrfs_file_extent_disk_bytenr(leaf, extent); u64 dl = btrfs_file_extent_disk_num_bytes(leaf, extent); - u64 off = btrfs_file_extent_offset(leaf, - extent); - btrfs_insert_file_extent(trans, root, - inode->i_ino, pos, - ds, dl, len, off); /* ds == 0 means there's a hole */ if (ds != 0) { - btrfs_inc_extent_ref(trans, root, + ret = btrfs_inc_extent_ref(trans, root, ds, dl, root->root_key.objectid, trans->transid, - inode->i_ino, pos); + inode->i_ino, key.offset); + if (ret) + goto out; } - pos = key.offset + len; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - ret = dup_item_to_inode(trans, root, path, - leaf, slot, &key, - inode->i_ino); - if (ret) - goto out; - pos = key.offset + btrfs_item_size_nr(leaf, - slot); } - } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { - ret = dup_item_to_inode(trans, root, path, leaf, - slot, &key, inode->i_ino); + } - if (ret) - goto out; + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY || + btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + btrfs_release_path(root, path); + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + ret = btrfs_insert_item(trans, root, &new_key, + buf, size); + BUG_ON(ret); + } else { + btrfs_release_path(root, path); } key.offset++; - btrfs_release_path(root, path); } - ret = 0; out: - btrfs_free_path(path); - - inode->i_blocks = src->i_blocks; - i_size_write(inode, src->i_size); - btrfs_update_inode(trans, root, inode); - - unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); - + btrfs_release_path(root, path); + if (ret == 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_blocks = src->i_blocks; + btrfs_i_size_write(inode, src->i_size); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, 
root, inode); + } btrfs_end_transaction(trans, root); - + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + if (ret) + vmtruncate(inode, 0); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); out_fput: fput(src_file); return ret; -- cgit v1.2.2 From 7ea394f1192bee1af67ea4762c88ef4b7b0487a8 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Tue, 5 Aug 2008 13:05:02 -0400 Subject: Btrfs: Fix nodatacow for the new data=ordered mode Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3932c7cd0fae..59b64c738fd1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" -- cgit v1.2.2 From b214107eda845f9a5851ae198f5b972e0dc30c45 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Sep 2008 16:43:31 -0400 Subject: Btrfs: trivial sparse fixes Fix a bunch of trivial sparse complaints. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 59b64c738fd1..f84b5f6991cc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -739,7 +739,7 @@ long btrfs_ioctl_trans_end(struct file *file) goto out; } btrfs_end_transaction(trans, root); - file->private_data = 0; + file->private_data = NULL; mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; -- cgit v1.2.2 From 31840ae1a6b433ca0e6a8d341756ff478bbf959e Mon Sep 17 00:00:00 2001 From: Zheng Yan Date: Tue, 23 Sep 2008 13:14:14 -0400 Subject: Btrfs: Full back reference support This patch makes the back reference system to explicit record the location of parent node for all types of extents. 
The location of parent node is placed into the offset field of backref key. Every time a tree block is balanced, the back references for the affected lower level extents are updated. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 57 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f84b5f6991cc..4c6e0c15754d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -76,9 +76,8 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - objectid, trans->transid, 0, 0, - 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -525,13 +524,10 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) struct file *src_file; struct inode *src; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered; struct btrfs_path *path; struct extent_buffer *leaf; char *buf; struct btrfs_key key; - struct btrfs_key new_key; - u32 size; u32 nritems; int slot; int ret; @@ -576,6 +572,7 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) /* do any pending delalloc/csum calc on src, one way or another, and lock file content */ while (1) { + struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) @@ -619,6 +616,32 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) key.objectid != src->i_ino) break; + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY || + btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { + u32 size; + struct btrfs_key new_key; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + 
btrfs_item_ptr_offset(leaf, slot), + size); + btrfs_release_path(root, path); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + btrfs_mark_buffer_dirty(leaf); + } + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *extent; int found_type; @@ -634,31 +657,15 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) /* ds == 0 means there's a hole */ if (ds != 0) { ret = btrfs_inc_extent_ref(trans, root, - ds, dl, + ds, dl, leaf->start, root->root_key.objectid, trans->transid, inode->i_ino, key.offset); - if (ret) - goto out; + BUG_ON(ret); } } } - - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY || - btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - btrfs_release_path(root, path); - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = inode->i_ino; - ret = btrfs_insert_item(trans, root, &new_key, - buf, size); - BUG_ON(ret); - } else { - btrfs_release_path(root, path); - } + btrfs_release_path(root, path); key.offset++; } ret = 0; -- cgit v1.2.2 From 5b21f2ed3f2947b5195b65c9fdbdd9e52904cc03 Mon Sep 17 00:00:00 2001 From: Zheng Yan Date: Fri, 26 Sep 2008 10:05:38 -0400 Subject: Btrfs: extent_map and data=ordered fixes for space balancing * Add an EXTENT_BOUNDARY state bit to keep the writepage code from merging data extents that are in the process of being relocated. This allows us to do accounting for them properly. * The balancing code relocates data extents indepdent of the underlying inode. The extent_map code was modified to properly account for things moving around (invalidating extent_map caches in the inode). 
* Don't take the drop_mutex in the create_subvol ioctl. It isn't required. * Fix walking of the ordered extent list to avoid races with sys_unlink * Change the lock ordering rules. Transaction start goes outside the drop_mutex. This allows btrfs_commit_transaction to directly drop the relocation trees. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4c6e0c15754d..04de767a8db2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -444,12 +444,10 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, goto out; } - mutex_lock(&root->fs_info->drop_mutex); if (root == root->fs_info->tree_root) ret = create_subvol(root, vol_args->name, namelen); else ret = create_snapshot(root, vol_args->name, namelen); - mutex_unlock(&root->fs_info->drop_mutex); out: kfree(vol_args); return ret; -- cgit v1.2.2 From a76a3cd40c1127ca199d4f7f37bf0d541bf44eb2 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 9 Oct 2008 11:46:29 -0400 Subject: Btrfs: Count space allocated to file in bytes This patch makes btrfs count space allocated to file in bytes instead of 512 byte sectors. Everything else in btrfs uses a byte count instead of sector sizes or blocks sizes, so this fits better. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 04de767a8db2..ab7a0f61ded8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -99,7 +99,7 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, inode_item->generation = cpu_to_le64(1); inode_item->size = cpu_to_le64(3); inode_item->nlink = cpu_to_le32(1); - inode_item->nblocks = cpu_to_le64(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); inode_item->mode = cpu_to_le32(S_IFDIR | 0755); btrfs_set_root_bytenr(&root_item, leaf->start); @@ -671,7 +671,7 @@ out: btrfs_release_path(root, path); if (ret == 0) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_blocks = src->i_blocks; + inode_set_bytes(inode, inode_get_bytes(src)); btrfs_i_size_write(inode, src->i_size); BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); -- cgit v1.2.2 From 3bb1a1bc42f2ae9582c28adf620484efcd4da38d Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 9 Oct 2008 11:46:24 -0400 Subject: Btrfs: Remove offset field from struct btrfs_extent_ref The offset field in struct btrfs_extent_ref records the position inside file that file extent is referenced by. In the new back reference system, tree leaves holding references to file extent are recorded explicitly. We can scan these tree leaves very quickly, so the offset field is not required. This patch also makes the back reference system check the objectid when extents are in deleting. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab7a0f61ded8..50c8a066d1f0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -658,7 +658,7 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ds, dl, leaf->start, root->root_key.objectid, trans->transid, - inode->i_ino, key.offset); + inode->i_ino); BUG_ON(ret); } } -- cgit v1.2.2 From cb8e70901d36f32017614f16d2cf7cc089544574 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Oct 2008 13:39:39 -0400 Subject: Btrfs: Fix subvolume creation locking rules Creating a subvolume is in many ways like a normal VFS ->mkdir, and we really need to play with the VFS topology locking rules. So instead of just creating the snapshot on disk and then later getting rid of conflicting aliases, do it correctly from the start. This will become especially important once we allow for subvolumes anywhere in the tree, and not just below a hidden root. Note that snapshots will need the same treatment, but due to the delay in creating them we can't do it currently. Chris promised to fix that issue, so I'll wait on that. 
Signed-off-by: Christoph Hellwig --- fs/btrfs/ioctl.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 50c8a066d1f0..3d85f18bbba6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,12 +29,15 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -48,8 +52,9 @@ -static noinline int create_subvol(struct btrfs_root *root, char *name, - int namelen) +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -151,14 +156,11 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, trans = btrfs_start_transaction(new_root, 1); BUG_ON(!trans); - ret = btrfs_create_subvol_root(new_root, trans, new_dirid, + ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid, BTRFS_I(dir)->block_group); if (ret) goto fail; - /* Invalidate existing dcache entry for new subvolume. */ - btrfs_invalidate_dcache_root(root, name, namelen); - fail: nr = trans->blocks_used; err = btrfs_commit_transaction(trans, new_root); @@ -210,6 +212,79 @@ fail_unlock: return ret; } +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +static noinline int btrfs_mksubvol(struct path *parent, char *name, + int mode, int namelen) +{ + struct dentry *dentry; + int error; + + mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + if (!IS_POSIXACL(parent->dentry->d_inode)) + mode &= ~current->fs->umask; + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(parent->dentry->d_inode, dentry); + if (error) + goto out_drop_write; + + mode &= (S_IRWXUGO|S_ISVTX); + error = security_inode_mkdir(parent->dentry->d_inode, dentry, mode); + if (error) + goto out_drop_write; + + /* + * Actually perform the low-level subvolume creation after all + * this VFS fuzz. + * + * Eventually we want to pass in an inode under which we create this + * subvolume, but for now all are under the filesystem root. + * + * Also we should pass on the mode eventually to allow creating new + * subvolume with specific mode bits. 
+ */ + error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, dentry, + name, namelen); + if (error) + goto out_drop_write; + + fsnotify_mkdir(parent->dentry->d_inode, dentry); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&parent->dentry->d_inode->i_mutex); + return error; +} + + int btrfs_defrag_file(struct file *file) { struct inode *inode = fdentry(file)->d_inode; @@ -395,9 +470,10 @@ out: return ret; } -static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, +static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -444,10 +520,14 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, goto out; } - if (root == root->fs_info->tree_root) - ret = create_subvol(root, vol_args->name, namelen); - else + if (root == root->fs_info->tree_root) { + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen); + } else { ret = create_snapshot(root, vol_args->name, namelen); + } + out: kfree(vol_args); return ret; @@ -761,7 +841,7 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(root, (void __user *)arg); + return btrfs_ioctl_snap_create(file, (void __user *)arg); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: -- cgit v1.2.2 From a3dddf3fc82a1f5942c0928abfd114e9a8c0d0c8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 10 Oct 2008 10:23:22 -0400 Subject: Btrfs: Don't call security_inode_mkdir during subvol creation Subvol creation already requires privs, and security_inode_mkdir isn't exported. For now we don't need it. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3d85f18bbba6..1136ce2febcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -254,11 +254,6 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, if (error) goto out_drop_write; - mode &= (S_IRWXUGO|S_ISVTX); - error = security_inode_mkdir(parent->dentry->d_inode, dentry, mode); - if (error) - goto out_drop_write; - /* * Actually perform the low-level subvolume creation after all * this VFS fuzz. -- cgit v1.2.2 From 84234f3a1f7c532e4afeba03cc8e7e4a8a5277ea Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Wed, 29 Oct 2008 14:49:05 -0400 Subject: Btrfs: Add root tree pointer transaction ids This patch adds transaction IDs to root tree pointers. Transaction IDs in tree pointers are compared with the generation numbers in block headers when reading root blocks of trees. This can detect some types of IO errors. Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1136ce2febcc..fd3c8b5676c1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -108,6 +108,7 @@ static noinline int create_subvol(struct btrfs_root *root, inode_item->mode = cpu_to_le32(S_IFDIR | 0755); btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); btrfs_set_root_level(&root_item, 0); btrfs_set_root_refs(&root_item, 1); btrfs_set_root_used(&root_item, 0); -- cgit v1.2.2 From 80ff385665b7fca29fefe358a60ab0d09f9b8e87 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 30 Oct 2008 14:20:02 -0400 Subject: Btrfs: update nodatacow code v2 This patch simplifies the nodatacow checker. If all references were created after the latest snapshot, then we can avoid COW safely. This patch also updates run_delalloc_nocow to do more fine-grained checking. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fd3c8b5676c1..7f915d478399 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -112,6 +112,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_level(&root_item, 0); btrfs_set_root_refs(&root_item, 1); btrfs_set_root_used(&root_item, 0); + btrfs_set_root_last_snapshot(&root_item, 0); memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); root_item.drop_level = 0; -- cgit v1.2.2 From d899e05215178fed903ad0e7fc1cb4d8e0cc0a88 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 30 Oct 2008 14:25:28 -0400 Subject: Btrfs: Add fallocate support v2 This patch updates btrfs-progs for fallocate support. fallocate is a little different in Btrfs because we need to tell the COW system that a given preallocated extent doesn't need to be cow'd as long as there are no snapshots of it. This leverages the -o nodatacow checks. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7f915d478399..9ff2b4e0e922 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -724,7 +724,8 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 ds = btrfs_file_extent_disk_bytenr(leaf, extent); u64 dl = btrfs_file_extent_disk_num_bytes(leaf, -- cgit v1.2.2 From c5c9cd4d1b827fe545ed2a945e91e3a6909f3886 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 12 Nov 2008 14:32:25 -0500 Subject: Btrfs: allow clone of an arbitrary file range This patch adds an additional CLONE_RANGE ioctl to clone an arbitrary (block-aligned) file range to another file. The original CLONE ioctl becomes a special case of cloning the entire file range. The logic is a bit more complex now since ranges may be cloned to different offsets, and because we may only be cloning the beginning or end of a particular extent or checksum item. An additional sanity check ensures the source and destination files aren't the same (which would previously deadlock), although eventually this could be extended to allow the duplication of file data at a different offset within the same file. Any extents within the destination range in the target file are dropped. We currently do not cope with the case where a compressed inline extent needs to be split. This will probably require decompressing the extent into a temporary address_space, and inserting just the cloned portion as a new compressed inline extent. For now, just return -EINVAL in this case. Note that this never comes up in the more common case of cloning an entire file. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 212 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9ff2b4e0e922..4d7cc7c504d0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -592,7 +592,8 @@ out: return ret; } -long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) +long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off, + u64 olen, u64 destoff) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -606,12 +607,29 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) u32 nritems; int slot; int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; - src_file = fget(src_fd); + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? 
+ */ + + src_file = fget(srcfd); if (!src_file) return -EBADF; src = src_file->f_dentry->d_inode; + ret = -EINVAL; + if (src == inode) + goto out_fput; + ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) goto out_fput; @@ -640,27 +658,46 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) mutex_lock(&inode->i_mutex); } - ret = -ENOTEMPTY; - if (inode->i_size) + /* determine range to clone */ + ret = -EINVAL; + if (off >= src->i_size || off + len > src->i_size) goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ((src->i_size + bs-1) & ~(bs-1)) + - off; + + /* verify the end result is block aligned */ + if ((off & (bs-1)) || + ((off + len) & (bs-1))) + goto out_unlock; + + printk("final src extent is %llu~%llu\n", off, len); + printk("final dst extent is %llu~%llu\n", destoff, len); /* do any pending delalloc/csum calc on src, one way or another, and lock file content */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, off+len); if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) break; - unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); if (ordered) btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, 0, (u64)-1); + btrfs_wait_ordered_range(src, off, off+len); } trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + + /* clone data */ key.objectid = src->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; @@ -691,56 +728,178 @@ long btrfs_ioctl_clone(struct file *file, unsigned long 
src_fd) key.objectid != src->i_ino) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY || - btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; u32 size; struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } btrfs_release_path(root, path); + if (key.offset + datal < off || + key.offset >= off+len) + goto next; + memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = inode->i_ino; - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) - goto out; + new_key.offset = key.offset + destoff - off; - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf, + if (type == BTRFS_FILE_EXTENT_REG) { + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); - btrfs_mark_buffer_dirty(leaf); - } - - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { - struct btrfs_file_extent_item *extent; - int found_type; - extent = btrfs_item_ptr(leaf, slot, + extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = 
btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds = btrfs_file_extent_disk_bytenr(leaf, - extent); - u64 dl = btrfs_file_extent_disk_num_bytes(leaf, - extent); - /* ds == 0 means there's a hole */ - if (ds != 0) { + printk(" orig disk %llu~%llu data %llu~%llu\n", + disko, diskl, datao, datal); + + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + if (key.offset + datao + datal + key.offset > + off + len) + datal = off + len - key.offset - datao; + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + printk(" final disk %llu~%llu data %llu~%llu\n", + disko, diskl, datao, datal); + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - ds, dl, leaf->start, - root->root_key.objectid, - trans->transid, - inode->i_ino); + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); BUG_ON(ret); } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + printk("len %lld skip %lld trim %lld\n", + datal, skip, trim); + if (comp && (skip || trim)) { + printk("btrfs clone_range can't split compressed inline extents yet\n"); + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + if (skip) { + u32 start = btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); } + + 
btrfs_mark_buffer_dirty(leaf); } + + if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { + u32 size; + struct btrfs_key new_key; + u64 coverslen; + int coff, clen; + + size = btrfs_item_size_nr(leaf, slot); + coverslen = (size / BTRFS_CRC32_SIZE) << + root->fs_info->sb->s_blocksize_bits; + printk("csums for %llu~%llu\n", + key.offset, coverslen); + if (key.offset + coverslen < off || + key.offset >= off+len) + goto next; + + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + btrfs_release_path(root, path); + + coff = 0; + if (off > key.offset) + coff = ((off - key.offset) >> + root->fs_info->sb->s_blocksize_bits) * + BTRFS_CRC32_SIZE; + clen = size - coff; + if (key.offset + coverslen > off+len) + clen -= ((key.offset+coverslen-off-len) >> + root->fs_info->sb->s_blocksize_bits) * + BTRFS_CRC32_SIZE; + printk(" will dup %d~%d of %d\n", + coff, clen, size); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + new_key.offset = key.offset + destoff - off; + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, clen); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf + coff, + btrfs_item_ptr_offset(leaf, slot), + clen); + btrfs_mark_buffer_dirty(leaf); + } + + next: btrfs_release_path(root, path); key.offset++; } @@ -749,13 +908,13 @@ out: btrfs_release_path(root, path); if (ret == 0) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode_set_bytes(inode, inode_get_bytes(src)); - btrfs_i_size_write(inode, src->i_size); + if (destoff + olen > inode->i_size) + btrfs_i_size_write(inode, destoff + olen); BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans, root); - unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); if (ret) vmtruncate(inode, 0); out_unlock: @@ -768,6 +927,16 @@ out_fput: return ret; } +long 
btrfs_ioctl_clone_range(struct file *file, unsigned long argptr) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, (void *)argptr, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. They should only be used by applications that @@ -851,7 +1020,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_BALANCE: return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg); + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, arg); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: -- cgit v1.2.2 From c146afad2c7fea6a366d4945c1bab9b03880f526 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Wed, 12 Nov 2008 14:34:12 -0500 Subject: Btrfs: mount ro and remount support This patch adds mount ro and remount support. The main changes in patch are: adding btrfs_remount and related helper function; splitting the transaction related code out of close_ctree into btrfs_commit_super; updating allocator to properly handle read only block group. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4d7cc7c504d0..52863cebd594 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) int namelen; int mod = 0; + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); if (!vol_args) @@ -478,6 +481,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, int namelen; int ret; + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); if (!vol_args) @@ -534,6 +540,11 @@ static int btrfs_ioctl_defrag(struct file *file) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; switch (inode->i_mode & S_IFMT) { case S_IFDIR: @@ -575,6 +586,9 @@ long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_vol_args *vol_args; int ret; + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); if (!vol_args) @@ -621,6 +635,10 @@ long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off, * they don't overlap)? 
*/ + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + src_file = fget(srcfd); if (!src_file) return -EBADF; @@ -958,6 +976,10 @@ long btrfs_ioctl_trans_start(struct file *file) goto out; } + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans++; mutex_unlock(&root->fs_info->trans_mutex); -- cgit v1.2.2 From 2b82032c34ec40515d3c45c36cd1961f37977de8 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Mon, 17 Nov 2008 21:11:30 -0500 Subject: Btrfs: Seed device support Seed device is a special btrfs with SEEDING super flag set and can only be mounted in read-only mode. Seed devices allow people to create new btrfs on top of them. The new FS contains the same contents as the seed device, but it can be mounted in read-write mode. This patch does the following: 1) split code in btrfs_alloc_chunk into two parts. The first part makes the newly allocated chunk usable, but does not do any operation that modifies the chunk tree. The second part does the chunk tree modifications. This division is for the bootstrap step of adding storage to the seed device. 2) Update device management code to handle seed device. The basic idea is: For an FS grown from seed devices, its seed devices are put into a list. Seed devices are opened on demand at mounting time. If any seed device is missing or has been changed, btrfs kernel module will refuse to mount the FS. 3) make btrfs_find_block_group not return NULL when all block groups are read-only. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 52863cebd594..f43df72b0e17 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -405,7 +405,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) devid = simple_strtoull(devstr, &end, 10); printk(KERN_INFO "resizing devid %llu\n", devid); } - device = btrfs_find_device(root, devid, NULL); + device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { printk(KERN_INFO "resizer unable to find device %llu\n", devid); ret = -EINVAL; -- cgit v1.2.2 From 3de4586c5278a28107030c336956381f69ff7a9d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Nov 2008 21:02:50 -0500 Subject: Btrfs: Allow subvolumes and snapshots anywhere in the directory tree Before, all snapshots and subvolumes lived in a single flat directory. This was awkward and confusing because the single flat directory was only writable with the ioctls. This commit changes the ioctls to create subvols and snapshots at any point in the directory tree. This requires making separate ioctls for snapshot and subvol creation instead of a combining them into one. The subvol ioctl does: btrfsctl -S subvol_name parent_dir After the ioctl is done subvol_name lives inside parent_dir. The snapshot ioctl does: btrfsctl -s path_for_snapshot root_to_snapshot path_for_snapshot can be an absolute or relative path. btrfsctl breaks it up into directory and basename components. root_to_snapshot can be any file or directory in the FS. The snapshot is taken of the entire root where that file lives. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 71 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f43df72b0e17..ec45b3086136 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -67,6 +67,7 @@ static noinline int create_subvol(struct btrfs_root *root, int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; unsigned long nr = 1; ret = btrfs_check_free_space(root, 1, 0); @@ -126,6 +127,7 @@ static noinline int create_subvol(struct btrfs_root *root, key.objectid = objectid; key.offset = 1; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); +printk("inserting root objectid %Lu\n", objectid); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); if (ret) @@ -135,24 +137,27 @@ static noinline int create_subvol(struct btrfs_root *root, * insert the directory item */ key.offset = (u64)-1; - dir = root->fs_info->sb->s_root->d_inode; - ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, + dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, name, namelen, dir->i_ino, &key, - BTRFS_FT_DIR, 0); + BTRFS_FT_DIR, index); if (ret) goto fail; - +#if 0 ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root, name, namelen, objectid, root->fs_info->sb->s_root->d_inode->i_ino, 0); if (ret) goto fail; - +#endif ret = btrfs_commit_transaction(trans, root); if (ret) goto fail_commit; - new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen); + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); BUG_ON(!new_root); trans = btrfs_start_transaction(new_root, 1); @@ -170,14 +175,16 @@ fail: ret = err; fail_commit: btrfs_btree_balance_dirty(root, nr); +printk("all done ret %d\n", ret); return ret; } -static int create_snapshot(struct btrfs_root *root, char *name, int namelen) +static 
int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) { struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; - int ret; + int ret = 0; int err; unsigned long nr = 0; @@ -188,7 +195,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) if (ret) goto fail_unlock; - pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; goto fail_unlock; @@ -201,12 +208,12 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) } memcpy(pending_snapshot->name, name, namelen); pending_snapshot->name[namelen] = '\0'; + pending_snapshot->dentry = dentry; trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); pending_snapshot->root = root; list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_update_inode(trans, root, root->inode); err = btrfs_commit_transaction(trans, root); fail_unlock: @@ -230,7 +237,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * inside this filesystem so it's quite a bit simpler. */ static noinline int btrfs_mksubvol(struct path *parent, char *name, - int mode, int namelen) + int mode, int namelen, + struct btrfs_root *snap_src) { struct dentry *dentry; int error; @@ -248,6 +256,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, if (!IS_POSIXACL(parent->dentry->d_inode)) mode &= ~current->fs->umask; + error = mnt_want_write(parent->mnt); if (error) goto out_dput; @@ -266,8 +275,12 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, * Also we should pass on the mode eventually to allow creating new * subvolume with specific mode bits. 
*/ - error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, dentry, - name, namelen); + if (snap_src) { + error = create_snapshot(snap_src, dentry, name, namelen); + } else { + error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, + dentry, name, namelen); + } if (error) goto out_drop_write; @@ -471,15 +484,16 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg) + void __user *arg, int subvol) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_dir_item *di; struct btrfs_path *path; + struct file *src_file; u64 root_dirid; int namelen; - int ret; + int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; @@ -523,12 +537,29 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, goto out; } - if (root == root->fs_info->tree_root) { + if (subvol) { ret = btrfs_mksubvol(&file->f_path, vol_args->name, file->f_path.dentry->d_inode->i_mode, - namelen); + namelen, NULL); } else { - ret = create_snapshot(root, vol_args->name, namelen); + struct inode *src_inode; + src_file = fget(vol_args->fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk("btrfs: Snapshot src from another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, BTRFS_I(src_inode)->root); + fput(src_file); } out: @@ -1030,7 +1061,9 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, (void __user *)arg); + return btrfs_ioctl_snap_create(file, (void __user *)arg, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, (void __user *)arg, 1); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: -- cgit v1.2.2 
From 3394e1607eaf870ebba37d303fbd590a4c569908 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Nov 2008 20:42:26 -0500 Subject: Btrfs: Give each subvol and snapshot their own anonymous devid Each subvolume has its own private inode number space, and so we need to fill in different device numbers for each subvolume to avoid confusing applications. This commit puts a struct super_block into struct btrfs_root so it can call set_anon_super() and get a different device number generated for each root. btrfs_rename is changed to prevent renames across subvols. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ec45b3086136..773db07b5f72 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -127,7 +127,6 @@ static noinline int create_subvol(struct btrfs_root *root, key.objectid = objectid; key.offset = 1; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); -printk("inserting root objectid %Lu\n", objectid); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); if (ret) @@ -175,7 +174,6 @@ fail: ret = err; fail_commit: btrfs_btree_balance_dirty(root, nr); -printk("all done ret %d\n", ret); return ret; } -- cgit v1.2.2 From 0660b5af3f7ac0fac69de975914e1f4a3a586fb3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Nov 2008 20:37:39 -0500 Subject: Btrfs: Add backrefs and forward refs for subvols and snapshots Subvols and snapshots can now be referenced from any point in the directory tree. We need to maintain back refs for them so we can find lost subvols. Forward refs are added so that we know all of the subvols and snapshots referenced anywhere in the directory tree of a single subvol. This can be used to do recursive snapshotting (but they aren't yet) and it is also used to detect and prevent directory loops when creating new snapshots. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 773db07b5f72..536ae8837801 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -145,13 +145,23 @@ static noinline int create_subvol(struct btrfs_root *root, BTRFS_FT_DIR, index); if (ret) goto fail; -#if 0 - ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root, - name, namelen, objectid, - root->fs_info->sb->s_root->d_inode->i_ino, 0); - if (ret) - goto fail; -#endif + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, BTRFS_ROOT_BACKREF_KEY, + root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + root->root_key.objectid, BTRFS_ROOT_REF_KEY, + objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + ret = btrfs_commit_transaction(trans, root); if (ret) goto fail_commit; -- cgit v1.2.2 From ea9e8b11bd1252dcbc23afefcf1a52ec6aa3c113 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Nov 2008 21:14:24 -0500 Subject: Btrfs: prevent loops in the directory tree when creating snapshots For a directory tree: /mnt/subvolA/subvolB btrfsctl -s /mnt/subvolA/subvolB /mnt Will create a directory loop with subvolA under subvolB. This commit uses the forward refs for each subvol and snapshot to error out before creating the loop. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 536ae8837801..8828109fa58e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -284,6 +284,56 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, * subvolume with specific mode bits. 
*/ if (snap_src) { + struct dentry *dir = dentry->d_parent; + struct dentry *test = dir->d_parent; + struct btrfs_path *path = btrfs_alloc_path(); + int ret; + u64 test_oid; + u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; + + test_oid = snap_src->root_key.objectid; + + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, parent_oid, test_oid); + if (ret == 0) + goto create; + btrfs_release_path(snap_src->fs_info->tree_root, path); + + /* we need to make sure we aren't creating a directory loop + * by taking a snapshot of something that has our current + * subvol in its directory tree. So, this loops through + * the dentries and checks the forward refs for each subvolume + * to see if is references the subvolume where we are + * placing this new snapshot. + */ + while(1) { + if (!test || + dir == snap_src->fs_info->sb->s_root || + test == snap_src->fs_info->sb->s_root || + test->d_inode->i_sb != snap_src->fs_info->sb) { + break; + } + if (S_ISLNK(test->d_inode->i_mode)) { + printk("Symlink in snapshot path, failed\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + test_oid = + BTRFS_I(test->d_inode)->root->root_key.objectid; + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, test_oid, parent_oid); + if (ret == 0) { + printk("Snapshot creation failed, looping\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + btrfs_release_path(snap_src->fs_info->tree_root, path); + test = test->d_parent; + } +create: + btrfs_free_path(path); error = create_snapshot(snap_src, dentry, name, namelen); } else { error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, -- cgit v1.2.2 From 4b4e25f2a6ddb070bab7f7dd2bd2926fb8db9e04 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 20 Nov 2008 10:22:27 -0500 Subject: Btrfs: compat code fixes The btrfs git kernel trees is used to build a standalone tree for compiling against older kernels. 
This commit makes the standalone tree work with 2.6.27 Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8828109fa58e..f3d68457e66a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -41,6 +41,7 @@ #include #include #include +#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" -- cgit v1.2.2 From 1ffa4f426c002161b7dbd58b297f5d0680e7dd6a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Dec 2008 09:53:09 -0500 Subject: Btrfs: remove unneeded btrfs_start_delalloc_inodes call It is called by btrfs_sync_fs. Signed-off-by: Sage Weil --- fs/btrfs/ioctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f3d68457e66a..35f650e183e1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1142,7 +1142,6 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_TRANS_END: return btrfs_ioctl_trans_end(file); case BTRFS_IOC_SYNC: - btrfs_start_delalloc_inodes(root); btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; } -- cgit v1.2.2 From b2950863c61bc24cf0f63bc05947d9d50663c4c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Dec 2008 09:54:17 -0500 Subject: Btrfs: make things static and include the right headers Shut up various sparse warnings about symbols that should be either static or have their declarations in scope. 
Signed-off-by: Christoph Hellwig --- fs/btrfs/ioctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 35f650e183e1..cc7c5161e269 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -354,7 +354,7 @@ out_unlock: } -int btrfs_defrag_file(struct file *file) +static int btrfs_defrag_file(struct file *file) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -649,7 +649,7 @@ static int btrfs_ioctl_defrag(struct file *file) return 0; } -long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -671,7 +671,7 @@ out: return ret; } -long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -696,8 +696,8 @@ out: return ret; } -long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off, - u64 olen, u64 destoff) +static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1035,7 +1035,7 @@ out_fput: return ret; } -long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr) +static long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr) { struct btrfs_ioctl_clone_range_args args; @@ -1051,7 +1051,7 @@ long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr) * basically own the machine, and have a very in depth understanding * of all the possible deadlocks and enospc problems. 
*/ -long btrfs_ioctl_trans_start(struct file *file) +static long btrfs_ioctl_trans_start(struct file *file) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; -- cgit v1.2.2 From 4bcabaa30a63a156fc50026f972377dada66452c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Dec 2008 06:36:08 -0500 Subject: Btrfs: clean up btrfs_ioctl a little bit Provide a void __user *argp pointer so that we can avoid duplicating the cast for various sub-command calls. Signed-off-by: Christoph Hellwig --- fs/btrfs/ioctl.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc7c5161e269..d2d5a5a9b026 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1117,20 +1117,21 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; switch (cmd) { case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, (void __user *)arg, 0); + return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, (void __user *)arg, 1); + return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, (void __user *)arg); + return btrfs_ioctl_resize(root, argp); case BTRFS_IOC_ADD_DEV: - return btrfs_ioctl_add_dev(root, (void __user *)arg); + return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, (void __user *)arg); + return btrfs_ioctl_rm_dev(root, argp); case BTRFS_IOC_BALANCE: return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_CLONE: -- cgit v1.2.2 From 7a865e8ac3a8ead776ea2c8c74fa2b2d00a2c9cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Dec 2008 09:52:24 -0500 Subject: Btrfs: btrfs: pass void __user * to 
btrfs_ioctl_clone_range Cleans the code up a little and also avoids a sparse warning due to the incorrect cast in the current version of the code. Signed-off-by: Christoph Hellwig --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d2d5a5a9b026..caea9eed9d62 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1035,11 +1035,11 @@ out_fput: return ret; } -static long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr) +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) { struct btrfs_ioctl_clone_range_args args; - if (copy_from_user(&args, (void *)argptr, sizeof(args))) + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); @@ -1137,7 +1137,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: - return btrfs_ioctl_clone_range(file, arg); + return btrfs_ioctl_clone_range(file, argp); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: -- cgit v1.2.2 From 607d432da0542e84ddcd358adfddac6f68500e3d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 2 Dec 2008 07:17:45 -0500 Subject: Btrfs: add support for multiple csum algorithms This patch gives us the space we will need in order to have different csum algorithms at some point in the future. We save the csum algorithm type in the superblock, and use those instead of defines. 
Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index caea9eed9d62..b4da53d55c82 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -714,7 +714,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; u64 hint_byte; - + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); /* * TODO: * - split compressed inline extents. annoying: we need to @@ -964,7 +965,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, int coff, clen; size = btrfs_item_size_nr(leaf, slot); - coverslen = (size / BTRFS_CRC32_SIZE) << + coverslen = (size / csum_size) << root->fs_info->sb->s_blocksize_bits; printk("csums for %llu~%llu\n", key.offset, coverslen); @@ -981,12 +982,12 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (off > key.offset) coff = ((off - key.offset) >> root->fs_info->sb->s_blocksize_bits) * - BTRFS_CRC32_SIZE; + csum_size; clen = size - coff; if (key.offset + coverslen > off+len) clen -= ((key.offset+coverslen-off-len) >> root->fs_info->sb->s_blocksize_bits) * - BTRFS_CRC32_SIZE; + csum_size; printk(" will dup %d~%d of %d\n", coff, clen, size); -- cgit v1.2.2 From d20f7043fa65659136c1a7c3c456eeeb5c6f431f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 8 Dec 2008 16:58:54 -0500 Subject: Btrfs: move data checksumming into a dedicated tree Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well. But, this has a few problems: * The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. 
It would be faster if we could checksum the compressed data instead. * If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure. * For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive. * It makes the front end caching code more complex, as we have to touch the subvolume and inodes as we cache extents. * There is potentially one copy of the checksum in each subvolume referencing an extent. The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by physical extent start and length. It means: * The checksum is against the data stored on disk, after any compression or encryption is done. * The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general. The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them. 
Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 55 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 53 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b4da53d55c82..6228b69c2b93 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -714,8 +714,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; u64 hint_byte; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + /* * TODO: * - split compressed inline extents. annoying: we need to @@ -833,7 +832,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || key.objectid != src->i_ino) break; @@ -958,56 +957,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_mark_buffer_dirty(leaf); } - if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { - u32 size; - struct btrfs_key new_key; - u64 coverslen; - int coff, clen; - - size = btrfs_item_size_nr(leaf, slot); - coverslen = (size / csum_size) << - root->fs_info->sb->s_blocksize_bits; - printk("csums for %llu~%llu\n", - key.offset, coverslen); - if (key.offset + coverslen < off || - key.offset >= off+len) - goto next; - - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - btrfs_release_path(root, path); - - coff = 0; - if (off > key.offset) - coff = ((off - key.offset) >> - root->fs_info->sb->s_blocksize_bits) * - csum_size; - clen = size - coff; - if (key.offset + coverslen > off+len) - clen -= ((key.offset+coverslen-off-len) >> - root->fs_info->sb->s_blocksize_bits) * - csum_size; - printk(" will dup %d~%d of %d\n", - coff, clen, size); - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = inode->i_ino; - new_key.offset = key.offset + 
destoff - off; - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, clen); - if (ret) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf + coff, - btrfs_item_ptr_offset(leaf, slot), - clen); - btrfs_mark_buffer_dirty(leaf); - } - next: btrfs_release_path(root, path); key.offset++; -- cgit v1.2.2 From cfc8ea87201dc9bb6aeb3fc80c61abee83e7cc06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 11 Dec 2008 16:30:06 -0500 Subject: Btrfs: mnt_drop_write in ioctl_trans_end Add missing mnt_drop_write to match the mnt_want_write in btrfs_ioctl_trans_start. Signed-off-by: Sage Weil --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6228b69c2b93..69c4a07f5869 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1059,6 +1059,8 @@ long btrfs_ioctl_trans_end(struct file *file) root->fs_info->open_ioctl_trans--; mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); + out: return ret; } -- cgit v1.2.2 From d2fb3437e4d8d12c73c587615ad187d5288547ec Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 11 Dec 2008 16:30:39 -0500 Subject: Btrfs: fix leaking block group on balance The block group structs are referenced in many different places, and it's not safe to free while balancing. So, those block group structs were simply leaked instead. This patch replaces the block group pointer in the inode with the starting byte offset of the block group and adds reference counting to the block group struct. 
Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 69c4a07f5869..5d67858ce993 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -173,7 +173,7 @@ static noinline int create_subvol(struct btrfs_root *root, trans = btrfs_start_transaction(new_root, 1); BUG_ON(!trans); - ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid, + ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, BTRFS_I(dir)->block_group); if (ret) goto fail; -- cgit v1.2.2 From ab67b7c1f780a8a321fe7ee49117775009350fb3 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Fri, 19 Dec 2008 10:58:39 -0500 Subject: Btrfs: Add missing mnt_drop_write in ioctl.c This patch adds the missing mnt_drop_write to match mnt_want_write in btrfs_ioctl_defrag and btrfs_ioctl_clone Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5d67858ce993..ab429fe0fa0f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -646,6 +646,7 @@ static int btrfs_ioctl_defrag(struct file *file) break; } + mnt_drop_write(file->f_path.mnt); return 0; } @@ -730,8 +731,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, return ret; src_file = fget(srcfd); - if (!src_file) - return -EBADF; + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } src = src_file->f_dentry->d_inode; ret = -EINVAL; @@ -982,6 +985,8 @@ out_unlock: btrfs_free_path(path); out_fput: fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); return ret; } -- cgit v1.2.2 From e441d54de4fd97dd381f3e73636f5ba51ff4c7d9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 5 Jan 2009 16:57:23 -0500 Subject: Btrfs: add permission checks to the ioctls Only root can add/remove devices Only root can defrag subtrees Only files open for 
writing can be defragged Only files open for writing can be the destination for a clone Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab429fe0fa0f..150784e936e6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -453,6 +453,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); if (!vol_args) @@ -638,16 +641,24 @@ static int btrfs_ioctl_defrag(struct file *file) switch (inode->i_mode & S_IFMT) { case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } btrfs_defrag_root(root, 0); btrfs_defrag_root(root->fs_info->extent_root, 0); break; case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } btrfs_defrag_file(file); break; } - +out: mnt_drop_write(file->f_path.mnt); - return 0; + return ret; } static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) @@ -655,6 +666,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); if (!vol_args) @@ -677,6 +691,9 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; @@ -726,6 +743,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * they don't overlap)? 
*/ + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE)) + return -EINVAL; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; -- cgit v1.2.2 From 52c2617990fed072220708d6b771dc10f37547b0 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Mon, 5 Jan 2009 15:43:43 -0500 Subject: Btrfs: update directory's size when creating subvol/snapshot Make sure directory's size properly updated when creating subvol/snapshot. Signed-off-by: Yan Zheng --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 150784e936e6..ba484aac1b9c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -147,6 +147,10 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + /* add the backref first */ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, BTRFS_ROOT_BACKREF_KEY, -- cgit v1.2.2 From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 5 Jan 2009 21:25:51 -0500 Subject: Btrfs: Fix checkpatch.pl warnings There were many, most are fixed now. struct-funcs.c generates some warnings but these are bogus. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ba484aac1b9c..c2aa33e3feb5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -311,7 +311,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, * to see if is references the subvolume where we are * placing this new snapshot. 
*/ - while(1) { + while (1) { if (!test || dir == snap_src->fs_info->sb->s_root || test == snap_src->fs_info->sb->s_root || @@ -319,7 +319,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, break; } if (S_ISLNK(test->d_inode->i_mode)) { - printk("Symlink in snapshot path, failed\n"); + printk(KERN_INFO "Btrfs symlink in snapshot " + "path, failed\n"); error = -EMLINK; btrfs_free_path(path); goto out_drop_write; @@ -329,7 +330,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, path, test_oid, parent_oid); if (ret == 0) { - printk("Snapshot creation failed, looping\n"); + printk(KERN_INFO "Btrfs snapshot creation " + "failed, looping\n"); error = -EMLINK; btrfs_free_path(path); goto out_drop_write; @@ -617,7 +619,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, src_inode = src_file->f_path.dentry->d_inode; if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { - printk("btrfs: Snapshot src from another FS\n"); + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); ret = -EINVAL; fput(src_file); goto out; @@ -810,9 +813,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ((off + len) & (bs-1))) goto out_unlock; - printk("final src extent is %llu~%llu\n", off, len); - printk("final dst extent is %llu~%llu\n", destoff, len); - /* do any pending delalloc/csum calc on src, one way or another, and lock file content */ while (1) { @@ -883,10 +883,13 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG) { - disko = btrfs_file_extent_disk_bytenr(leaf, extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); datao = btrfs_file_extent_offset(leaf, 
extent); - datal = btrfs_file_extent_num_bytes(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); } else if (type == BTRFS_FILE_EXTENT_INLINE) { /* take upper bound, may be compressed */ datal = btrfs_file_extent_ram_bytes(leaf, @@ -916,8 +919,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - printk(" orig disk %llu~%llu data %llu~%llu\n", - disko, diskl, datao, datal); if (off > key.offset) { datao += off - key.offset; @@ -929,8 +930,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* disko == 0 means it's a hole */ if (!disko) datao = 0; - printk(" final disk %llu~%llu data %llu~%llu\n", - disko, diskl, datao, datal); btrfs_set_file_extent_offset(leaf, extent, datao); @@ -952,12 +951,11 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, skip = off - key.offset; new_key.offset += skip; } + if (key.offset + datal > off+len) trim = key.offset + datal - (off+len); - printk("len %lld skip %lld trim %lld\n", - datal, skip, trim); + if (comp && (skip || trim)) { - printk("btrfs clone_range can't split compressed inline extents yet\n"); ret = -EINVAL; goto out; } @@ -969,7 +967,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out; if (skip) { - u32 start = btrfs_file_extent_calc_inline_size(0); + u32 start = + btrfs_file_extent_calc_inline_size(0); memmove(buf+start, buf+start+skip, datal); } @@ -985,7 +984,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_mark_buffer_dirty(leaf); } - next: +next: btrfs_release_path(root, path); key.offset++; } -- cgit v1.2.2