summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2016-11-09 17:13:11 -0500
committerDarrick J. Wong <darrick.wong@oracle.com>2016-12-10 15:39:45 -0500
commit29ac8e856cb3694e004037de595dec4ec53d42f2 (patch)
tree006df64270ecc128a4b837215c7ef78bd259ff51
parent86e59436d406d833a5da4a94aefb3c3be6b26053 (diff)
ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
Connect the new VFS clone_range, copy_range, and dedupe_range features to the existing reflink capability of ocfs2. Compared to the existing ocfs2 reflink ioctl, we have to do things a little differently to support the VFS semantics (we can clone subranges of a file but we don't clone xattrs), but the VFS ioctls are more broadly supported.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: Convert inline data files to extents files before reflinking, and fix i_blocks so that stat(2) output is correct.
v3: Make zero-length dedupe consistent with btrfs behavior.
v4: Use VFS double-inode lock routines and remove MAX_DEDUPE_LEN.
-rw-r--r--fs/ocfs2/file.c35
-rw-r--r--fs/ocfs2/file.h3
-rw-r--r--fs/ocfs2/refcounttree.c432
-rw-r--r--fs/ocfs2/refcounttree.h7
4 files changed, 474 insertions, 3 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d261f3a91870..c4889655d32b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
1667 *done = ret; 1667 *done = ret;
1668} 1668}
1669 1669
1670static int ocfs2_remove_inode_range(struct inode *inode, 1670int ocfs2_remove_inode_range(struct inode *inode,
1671 struct buffer_head *di_bh, u64 byte_start, 1671 struct buffer_head *di_bh, u64 byte_start,
1672 u64 byte_len) 1672 u64 byte_len)
1673{ 1673{
1674 int ret = 0, flags = 0, done = 0, i; 1674 int ret = 0, flags = 0, done = 0, i;
1675 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; 1675 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -2439,6 +2439,31 @@ out:
2439 return offset; 2439 return offset;
2440} 2440}
2441 2441
2442static int ocfs2_file_clone_range(struct file *file_in,
2443 loff_t pos_in,
2444 struct file *file_out,
2445 loff_t pos_out,
2446 u64 len)
2447{
2448 return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
2449 len, false);
2450}
2451
2452static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
2453 u64 loff,
2454 u64 len,
2455 struct file *dst_file,
2456 u64 dst_loff)
2457{
2458 int error;
2459
2460 error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
2461 len, true);
2462 if (error)
2463 return error;
2464 return len;
2465}
2466
2442const struct inode_operations ocfs2_file_iops = { 2467const struct inode_operations ocfs2_file_iops = {
2443 .setattr = ocfs2_setattr, 2468 .setattr = ocfs2_setattr,
2444 .getattr = ocfs2_getattr, 2469 .getattr = ocfs2_getattr,
@@ -2478,6 +2503,8 @@ const struct file_operations ocfs2_fops = {
2478 .splice_read = generic_file_splice_read, 2503 .splice_read = generic_file_splice_read,
2479 .splice_write = iter_file_splice_write, 2504 .splice_write = iter_file_splice_write,
2480 .fallocate = ocfs2_fallocate, 2505 .fallocate = ocfs2_fallocate,
2506 .clone_file_range = ocfs2_file_clone_range,
2507 .dedupe_file_range = ocfs2_file_dedupe_range,
2481}; 2508};
2482 2509
2483const struct file_operations ocfs2_dops = { 2510const struct file_operations ocfs2_dops = {
@@ -2523,6 +2550,8 @@ const struct file_operations ocfs2_fops_no_plocks = {
2523 .splice_read = generic_file_splice_read, 2550 .splice_read = generic_file_splice_read,
2524 .splice_write = iter_file_splice_write, 2551 .splice_write = iter_file_splice_write,
2525 .fallocate = ocfs2_fallocate, 2552 .fallocate = ocfs2_fallocate,
2553 .clone_file_range = ocfs2_file_clone_range,
2554 .dedupe_file_range = ocfs2_file_dedupe_range,
2526}; 2555};
2527 2556
2528const struct file_operations ocfs2_dops_no_plocks = { 2557const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e8c62f22215c..897fd9a2e51d 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
82 82
83int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 83int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
84 size_t count); 84 size_t count);
85int ocfs2_remove_inode_range(struct inode *inode,
86 struct buffer_head *di_bh, u64 byte_start,
87 u64 byte_len);
85#endif /* OCFS2_FILE_H */ 88#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index dc8089af9ddf..b18465e330b1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,7 @@
34#include "xattr.h" 34#include "xattr.h"
35#include "namei.h" 35#include "namei.h"
36#include "ocfs2_trace.h" 36#include "ocfs2_trace.h"
37#include "file.h"
37 38
38#include <linux/bio.h> 39#include <linux/bio.h>
39#include <linux/blkdev.h> 40#include <linux/blkdev.h>
@@ -4448,3 +4449,434 @@ out:
4448 4449
4449 return error; 4450 return error;
4450} 4451}
4452
4453/* Update destination inode size, if necessary. */
4454static int ocfs2_reflink_update_dest(struct inode *dest,
4455 struct buffer_head *d_bh,
4456 loff_t newlen)
4457{
4458 handle_t *handle;
4459 int ret;
4460
4461 dest->i_blocks = ocfs2_inode_sector_count(dest);
4462
4463 if (newlen <= i_size_read(dest))
4464 return 0;
4465
4466 handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
4467 OCFS2_INODE_UPDATE_CREDITS);
4468 if (IS_ERR(handle)) {
4469 ret = PTR_ERR(handle);
4470 mlog_errno(ret);
4471 return ret;
4472 }
4473
4474 /* Extend i_size if needed. */
4475 spin_lock(&OCFS2_I(dest)->ip_lock);
4476 if (newlen > i_size_read(dest))
4477 i_size_write(dest, newlen);
4478 spin_unlock(&OCFS2_I(dest)->ip_lock);
4479 dest->i_ctime = dest->i_mtime = current_time(dest);
4480
4481 ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
4482 if (ret) {
4483 mlog_errno(ret);
4484 goto out_commit;
4485 }
4486
4487out_commit:
4488 ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
4489 return ret;
4490}
4491
4492/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
4493static int ocfs2_reflink_remap_extent(struct inode *s_inode,
4494 struct buffer_head *s_bh,
4495 loff_t pos_in,
4496 struct inode *t_inode,
4497 struct buffer_head *t_bh,
4498 loff_t pos_out,
4499 loff_t len,
4500 struct ocfs2_cached_dealloc_ctxt *dealloc)
4501{
4502 struct ocfs2_extent_tree s_et;
4503 struct ocfs2_extent_tree t_et;
4504 struct ocfs2_dinode *dis;
4505 struct buffer_head *ref_root_bh = NULL;
4506 struct ocfs2_refcount_tree *ref_tree;
4507 struct ocfs2_super *osb;
4508 loff_t pstart, plen;
4509 u32 p_cluster, num_clusters, slast, spos, tpos;
4510 unsigned int ext_flags;
4511 int ret = 0;
4512
4513 osb = OCFS2_SB(s_inode->i_sb);
4514 dis = (struct ocfs2_dinode *)s_bh->b_data;
4515 ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
4516 ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
4517
4518 spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
4519 tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
4520 slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
4521
4522 while (spos < slast) {
4523 if (fatal_signal_pending(current)) {
4524 ret = -EINTR;
4525 goto out;
4526 }
4527
4528 /* Look up the extent. */
4529 ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
4530 &num_clusters, &ext_flags);
4531 if (ret) {
4532 mlog_errno(ret);
4533 goto out;
4534 }
4535
4536 num_clusters = min_t(u32, num_clusters, slast - spos);
4537
4538 /* Punch out the dest range. */
4539 pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
4540 plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
4541 ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
4542 if (ret) {
4543 mlog_errno(ret);
4544 goto out;
4545 }
4546
4547 if (p_cluster == 0)
4548 goto next_loop;
4549
4550 /* Lock the refcount btree... */
4551 ret = ocfs2_lock_refcount_tree(osb,
4552 le64_to_cpu(dis->i_refcount_loc),
4553 1, &ref_tree, &ref_root_bh);
4554 if (ret) {
4555 mlog_errno(ret);
4556 goto out;
4557 }
4558
4559 /* Mark s_inode's extent as refcounted. */
4560 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
4561 ret = ocfs2_add_refcount_flag(s_inode, &s_et,
4562 &ref_tree->rf_ci,
4563 ref_root_bh, spos,
4564 p_cluster, num_clusters,
4565 dealloc, NULL);
4566 if (ret) {
4567 mlog_errno(ret);
4568 goto out_unlock_refcount;
4569 }
4570 }
4571
4572 /* Map in the new extent. */
4573 ext_flags |= OCFS2_EXT_REFCOUNTED;
4574 ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
4575 &ref_tree->rf_ci,
4576 ref_root_bh,
4577 tpos, p_cluster,
4578 num_clusters,
4579 ext_flags,
4580 dealloc);
4581 if (ret) {
4582 mlog_errno(ret);
4583 goto out_unlock_refcount;
4584 }
4585
4586 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4587 brelse(ref_root_bh);
4588next_loop:
4589 spos += num_clusters;
4590 tpos += num_clusters;
4591 }
4592
4593out:
4594 return ret;
4595out_unlock_refcount:
4596 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4597 brelse(ref_root_bh);
4598 return ret;
4599}
4600
4601/* Set up refcount tree and remap s_inode to t_inode. */
4602static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
4603 struct buffer_head *s_bh,
4604 loff_t pos_in,
4605 struct inode *t_inode,
4606 struct buffer_head *t_bh,
4607 loff_t pos_out,
4608 loff_t len)
4609{
4610 struct ocfs2_cached_dealloc_ctxt dealloc;
4611 struct ocfs2_super *osb;
4612 struct ocfs2_dinode *dis;
4613 struct ocfs2_dinode *dit;
4614 int ret;
4615
4616 osb = OCFS2_SB(s_inode->i_sb);
4617 dis = (struct ocfs2_dinode *)s_bh->b_data;
4618 dit = (struct ocfs2_dinode *)t_bh->b_data;
4619 ocfs2_init_dealloc_ctxt(&dealloc);
4620
4621 /*
4622 * If we're reflinking the entire file and the source is inline
4623 * data, just copy the contents.
4624 */
4625 if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
4626 i_size_read(t_inode) <= len &&
4627 (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
4628 ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
4629 if (ret)
4630 mlog_errno(ret);
4631 goto out;
4632 }
4633
4634 /*
4635 * If both inodes belong to two different refcount groups then
4636 * forget it because we don't know how (or want) to go merging
4637 * refcount trees.
4638 */
4639 ret = -EOPNOTSUPP;
4640 if (ocfs2_is_refcount_inode(s_inode) &&
4641 ocfs2_is_refcount_inode(t_inode) &&
4642 le64_to_cpu(dis->i_refcount_loc) !=
4643 le64_to_cpu(dit->i_refcount_loc))
4644 goto out;
4645
4646 /* Neither inode has a refcount tree. Add one to s_inode. */
4647 if (!ocfs2_is_refcount_inode(s_inode) &&
4648 !ocfs2_is_refcount_inode(t_inode)) {
4649 ret = ocfs2_create_refcount_tree(s_inode, s_bh);
4650 if (ret) {
4651 mlog_errno(ret);
4652 goto out;
4653 }
4654 }
4655
4656 /* Ensure that both inodes end up with the same refcount tree. */
4657 if (!ocfs2_is_refcount_inode(s_inode)) {
4658 ret = ocfs2_set_refcount_tree(s_inode, s_bh,
4659 le64_to_cpu(dit->i_refcount_loc));
4660 if (ret) {
4661 mlog_errno(ret);
4662 goto out;
4663 }
4664 }
4665 if (!ocfs2_is_refcount_inode(t_inode)) {
4666 ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4667 le64_to_cpu(dis->i_refcount_loc));
4668 if (ret) {
4669 mlog_errno(ret);
4670 goto out;
4671 }
4672 }
4673
4674 /* Turn off inline data in the dest file. */
4675 if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4676 ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
4677 if (ret) {
4678 mlog_errno(ret);
4679 goto out;
4680 }
4681 }
4682
4683 /* Actually remap extents now. */
4684 ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
4685 pos_out, len, &dealloc);
4686 if (ret) {
4687 mlog_errno(ret);
4688 goto out;
4689 }
4690
4691out:
4692 if (ocfs2_dealloc_has_cluster(&dealloc)) {
4693 ocfs2_schedule_truncate_log_flush(osb, 1);
4694 ocfs2_run_deallocs(osb, &dealloc);
4695 }
4696
4697 return ret;
4698}
4699
4700/* Lock an inode and grab a bh pointing to the inode. */
4701static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
4702 struct buffer_head **bh1,
4703 struct inode *t_inode,
4704 struct buffer_head **bh2)
4705{
4706 struct inode *inode1;
4707 struct inode *inode2;
4708 struct ocfs2_inode_info *oi1;
4709 struct ocfs2_inode_info *oi2;
4710 bool same_inode = (s_inode == t_inode);
4711 int status;
4712
4713 /* First grab the VFS and rw locks. */
4714 lock_two_nondirectories(s_inode, t_inode);
4715 inode1 = s_inode;
4716 inode2 = t_inode;
4717 if (inode1->i_ino > inode2->i_ino)
4718 swap(inode1, inode2);
4719
4720 status = ocfs2_rw_lock(inode1, 1);
4721 if (status) {
4722 mlog_errno(status);
4723 goto out_i1;
4724 }
4725 if (!same_inode) {
4726 status = ocfs2_rw_lock(inode2, 1);
4727 if (status) {
4728 mlog_errno(status);
4729 goto out_i2;
4730 }
4731 }
4732
4733 /* Now go for the cluster locks */
4734 oi1 = OCFS2_I(inode1);
4735 oi2 = OCFS2_I(inode2);
4736
4737 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
4738 (unsigned long long)oi2->ip_blkno);
4739
4740 if (*bh1)
4741 *bh1 = NULL;
4742 if (*bh2)
4743 *bh2 = NULL;
4744
4745 /* We always want to lock the one with the lower lockid first. */
4746 if (oi1->ip_blkno > oi2->ip_blkno)
4747 mlog_errno(-ENOLCK);
4748
4749 /* lock id1 */
4750 status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
4751 if (status < 0) {
4752 if (status != -ENOENT)
4753 mlog_errno(status);
4754 goto out_rw2;
4755 }
4756
4757 /* lock id2 */
4758 if (!same_inode) {
4759 status = ocfs2_inode_lock_nested(inode2, bh2, 1,
4760 OI_LS_REFLINK_TARGET);
4761 if (status < 0) {
4762 if (status != -ENOENT)
4763 mlog_errno(status);
4764 goto out_cl1;
4765 }
4766 } else
4767 *bh2 = *bh1;
4768
4769 trace_ocfs2_double_lock_end(
4770 (unsigned long long)OCFS2_I(inode1)->ip_blkno,
4771 (unsigned long long)OCFS2_I(inode2)->ip_blkno);
4772
4773 return 0;
4774
4775out_cl1:
4776 ocfs2_inode_unlock(inode1, 1);
4777 brelse(*bh1);
4778 *bh1 = NULL;
4779out_rw2:
4780 ocfs2_rw_unlock(inode2, 1);
4781out_i2:
4782 ocfs2_rw_unlock(inode1, 1);
4783out_i1:
4784 unlock_two_nondirectories(s_inode, t_inode);
4785 return status;
4786}
4787
4788/* Unlock both inodes and release buffers. */
4789static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
4790 struct buffer_head *s_bh,
4791 struct inode *t_inode,
4792 struct buffer_head *t_bh)
4793{
4794 ocfs2_inode_unlock(s_inode, 1);
4795 ocfs2_rw_unlock(s_inode, 1);
4796 brelse(s_bh);
4797 if (s_inode != t_inode) {
4798 ocfs2_inode_unlock(t_inode, 1);
4799 ocfs2_rw_unlock(t_inode, 1);
4800 brelse(t_bh);
4801 }
4802 unlock_two_nondirectories(s_inode, t_inode);
4803}
4804
4805/* Link a range of blocks from one file to another. */
4806int ocfs2_reflink_remap_range(struct file *file_in,
4807 loff_t pos_in,
4808 struct file *file_out,
4809 loff_t pos_out,
4810 u64 len,
4811 bool is_dedupe)
4812{
4813 struct inode *inode_in = file_inode(file_in);
4814 struct inode *inode_out = file_inode(file_out);
4815 struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
4816 struct buffer_head *in_bh = NULL, *out_bh = NULL;
4817 bool same_inode = (inode_in == inode_out);
4818 ssize_t ret;
4819
4820 if (!ocfs2_refcount_tree(osb))
4821 return -EOPNOTSUPP;
4822 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
4823 return -EROFS;
4824
4825 /* Lock both files against IO */
4826 ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
4827 if (ret)
4828 return ret;
4829
4830 /* Check file eligibility and prepare for block sharing. */
4831 ret = -EINVAL;
4832 if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
4833 (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
4834 goto out_unlock;
4835
4836 ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
4837 &len, is_dedupe);
4838 if (ret || len == 0)
4839 goto out_unlock;
4840
4841 /* Lock out changes to the allocation maps and remap. */
4842 down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
4843 if (!same_inode)
4844 down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
4845 SINGLE_DEPTH_NESTING);
4846
4847 ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
4848 out_bh, pos_out, len);
4849
4850 /* Zap any page cache for the destination file's range. */
4851 if (!ret)
4852 truncate_inode_pages_range(&inode_out->i_data, pos_out,
4853 PAGE_ALIGN(pos_out + len) - 1);
4854
4855 up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
4856 if (!same_inode)
4857 up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
4858 if (ret) {
4859 mlog_errno(ret);
4860 goto out_unlock;
4861 }
4862
4863 /*
4864 * Empty the extent map so that we may get the right extent
4865 * record from the disk.
4866 */
4867 ocfs2_extent_map_trunc(inode_in, 0);
4868 ocfs2_extent_map_trunc(inode_out, 0);
4869
4870 ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
4871 if (ret) {
4872 mlog_errno(ret);
4873 goto out_unlock;
4874 }
4875
4876 ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
4877 return 0;
4878
4879out_unlock:
4880 ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
4881 return ret;
4882}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 6422bbcdb525..4af55bf4b35b 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
115 const char __user *oldname, 115 const char __user *oldname,
116 const char __user *newname, 116 const char __user *newname,
117 bool preserve); 117 bool preserve);
118int ocfs2_reflink_remap_range(struct file *file_in,
119 loff_t pos_in,
120 struct file *file_out,
121 loff_t pos_out,
122 u64 len,
123 bool is_dedupe);
124
118#endif /* OCFS2_REFCOUNTTREE_H */ 125#endif /* OCFS2_REFCOUNTTREE_H */