diff options
author | Mark Fasheh <mfasheh@suse.de> | 2013-08-06 14:42:51 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@fusionio.com> | 2013-09-01 08:05:00 -0400 |
commit | 416161db9b63e353a8fb79d1369779175102fca1 (patch) | |
tree | 60627f898b85d3173e83222d12fc9f677e84730e /fs/btrfs/ioctl.c | |
parent | 4b384318a74e38eb248f74f9a92a700d2ce841f1 (diff) |
btrfs: offline dedupe
This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME, which will try to
de-duplicate a list of extents across a range of files.
Internally, the ioctl re-uses code from the clone ioctl. This avoids
rewriting a large chunk of extent handling code.
Userspace passes in an array of (file, offset) pairs along with a length
argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison
of the user data before deduping the extent. The status and the number of
bytes deduped are returned for each operation.
Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r-- | fs/btrfs/ioctl.c | 279 |
1 files changed, 279 insertions, 0 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b5148a1b0d3..022d8364e072 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/blkdev.h> | 43 | #include <linux/blkdev.h> |
44 | #include <linux/uuid.h> | 44 | #include <linux/uuid.h> |
45 | #include <linux/btrfs.h> | 45 | #include <linux/btrfs.h> |
46 | #include <linux/uaccess.h> | ||
46 | #include "compat.h" | 47 | #include "compat.h" |
47 | #include "ctree.h" | 48 | #include "ctree.h" |
48 | #include "disk-io.h" | 49 | #include "disk-io.h" |
@@ -57,6 +58,9 @@ | |||
57 | #include "send.h" | 58 | #include "send.h" |
58 | #include "dev-replace.h" | 59 | #include "dev-replace.h" |
59 | 60 | ||
61 | static int btrfs_clone(struct inode *src, struct inode *inode, | ||
62 | u64 off, u64 olen, u64 olen_aligned, u64 destoff); | ||
63 | |||
60 | /* Mask out flags that are inappropriate for the given type of inode. */ | 64 | /* Mask out flags that are inappropriate for the given type of inode. */ |
61 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) | 65 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) |
62 | { | 66 | { |
@@ -2470,6 +2474,34 @@ out: | |||
2470 | return ret; | 2474 | return ret; |
2471 | } | 2475 | } |
2472 | 2476 | ||
2477 | static struct page *extent_same_get_page(struct inode *inode, u64 off) | ||
2478 | { | ||
2479 | struct page *page; | ||
2480 | pgoff_t index; | ||
2481 | struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; | ||
2482 | |||
2483 | index = off >> PAGE_CACHE_SHIFT; | ||
2484 | |||
2485 | page = grab_cache_page(inode->i_mapping, index); | ||
2486 | if (!page) | ||
2487 | return NULL; | ||
2488 | |||
2489 | if (!PageUptodate(page)) { | ||
2490 | if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, | ||
2491 | 0)) | ||
2492 | return NULL; | ||
2493 | lock_page(page); | ||
2494 | if (!PageUptodate(page)) { | ||
2495 | unlock_page(page); | ||
2496 | page_cache_release(page); | ||
2497 | return NULL; | ||
2498 | } | ||
2499 | } | ||
2500 | unlock_page(page); | ||
2501 | |||
2502 | return page; | ||
2503 | } | ||
2504 | |||
2473 | static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) | 2505 | static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) |
2474 | { | 2506 | { |
2475 | /* do any pending delalloc/csum calc on src, one way or | 2507 | /* do any pending delalloc/csum calc on src, one way or |
@@ -2490,6 +2522,251 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) | |||
2490 | } | 2522 | } |
2491 | } | 2523 | } |
2492 | 2524 | ||
2525 | static void btrfs_double_unlock(struct inode *inode1, u64 loff1, | ||
2526 | struct inode *inode2, u64 loff2, u64 len) | ||
2527 | { | ||
2528 | unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); | ||
2529 | unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); | ||
2530 | |||
2531 | mutex_unlock(&inode1->i_mutex); | ||
2532 | mutex_unlock(&inode2->i_mutex); | ||
2533 | } | ||
2534 | |||
2535 | static void btrfs_double_lock(struct inode *inode1, u64 loff1, | ||
2536 | struct inode *inode2, u64 loff2, u64 len) | ||
2537 | { | ||
2538 | if (inode1 < inode2) { | ||
2539 | swap(inode1, inode2); | ||
2540 | swap(loff1, loff2); | ||
2541 | } | ||
2542 | |||
2543 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); | ||
2544 | lock_extent_range(inode1, loff1, len); | ||
2545 | if (inode1 != inode2) { | ||
2546 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); | ||
2547 | lock_extent_range(inode2, loff2, len); | ||
2548 | } | ||
2549 | } | ||
2550 | |||
2551 | static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, | ||
2552 | u64 dst_loff, u64 len) | ||
2553 | { | ||
2554 | int ret = 0; | ||
2555 | struct page *src_page, *dst_page; | ||
2556 | unsigned int cmp_len = PAGE_CACHE_SIZE; | ||
2557 | void *addr, *dst_addr; | ||
2558 | |||
2559 | while (len) { | ||
2560 | if (len < PAGE_CACHE_SIZE) | ||
2561 | cmp_len = len; | ||
2562 | |||
2563 | src_page = extent_same_get_page(src, loff); | ||
2564 | if (!src_page) | ||
2565 | return -EINVAL; | ||
2566 | dst_page = extent_same_get_page(dst, dst_loff); | ||
2567 | if (!dst_page) { | ||
2568 | page_cache_release(src_page); | ||
2569 | return -EINVAL; | ||
2570 | } | ||
2571 | addr = kmap_atomic(src_page); | ||
2572 | dst_addr = kmap_atomic(dst_page); | ||
2573 | |||
2574 | flush_dcache_page(src_page); | ||
2575 | flush_dcache_page(dst_page); | ||
2576 | |||
2577 | if (memcmp(addr, dst_addr, cmp_len)) | ||
2578 | ret = BTRFS_SAME_DATA_DIFFERS; | ||
2579 | |||
2580 | kunmap_atomic(addr); | ||
2581 | kunmap_atomic(dst_addr); | ||
2582 | page_cache_release(src_page); | ||
2583 | page_cache_release(dst_page); | ||
2584 | |||
2585 | if (ret) | ||
2586 | break; | ||
2587 | |||
2588 | loff += cmp_len; | ||
2589 | dst_loff += cmp_len; | ||
2590 | len -= cmp_len; | ||
2591 | } | ||
2592 | |||
2593 | return ret; | ||
2594 | } | ||
2595 | |||
2596 | static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) | ||
2597 | { | ||
2598 | u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; | ||
2599 | |||
2600 | if (off + len > inode->i_size || off + len < off) | ||
2601 | return -EINVAL; | ||
2602 | /* Check that we are block aligned - btrfs_clone() requires this */ | ||
2603 | if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) | ||
2604 | return -EINVAL; | ||
2605 | |||
2606 | return 0; | ||
2607 | } | ||
2608 | |||
2609 | static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, | ||
2610 | struct inode *dst, u64 dst_loff) | ||
2611 | { | ||
2612 | int ret; | ||
2613 | |||
2614 | /* | ||
2615 | * btrfs_clone() can't handle extents in the same file | ||
2616 | * yet. Once that works, we can drop this check and replace it | ||
2617 | * with a check for the same inode, but overlapping extents. | ||
2618 | */ | ||
2619 | if (src == dst) | ||
2620 | return -EINVAL; | ||
2621 | |||
2622 | btrfs_double_lock(src, loff, dst, dst_loff, len); | ||
2623 | |||
2624 | ret = extent_same_check_offsets(src, loff, len); | ||
2625 | if (ret) | ||
2626 | goto out_unlock; | ||
2627 | |||
2628 | ret = extent_same_check_offsets(dst, dst_loff, len); | ||
2629 | if (ret) | ||
2630 | goto out_unlock; | ||
2631 | |||
2632 | /* don't make the dst file partly checksummed */ | ||
2633 | if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != | ||
2634 | (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { | ||
2635 | ret = -EINVAL; | ||
2636 | goto out_unlock; | ||
2637 | } | ||
2638 | |||
2639 | ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); | ||
2640 | if (ret == 0) | ||
2641 | ret = btrfs_clone(src, dst, loff, len, len, dst_loff); | ||
2642 | |||
2643 | out_unlock: | ||
2644 | btrfs_double_unlock(src, loff, dst, dst_loff, len); | ||
2645 | |||
2646 | return ret; | ||
2647 | } | ||
2648 | |||
2649 | #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) | ||
2650 | |||
2651 | static long btrfs_ioctl_file_extent_same(struct file *file, | ||
2652 | void __user *argp) | ||
2653 | { | ||
2654 | struct btrfs_ioctl_same_args *args = argp; | ||
2655 | struct btrfs_ioctl_same_args same; | ||
2656 | struct btrfs_ioctl_same_extent_info info; | ||
2657 | struct inode *src = file->f_dentry->d_inode; | ||
2658 | struct file *dst_file = NULL; | ||
2659 | struct inode *dst; | ||
2660 | u64 off; | ||
2661 | u64 len; | ||
2662 | int i; | ||
2663 | int ret; | ||
2664 | u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; | ||
2665 | bool is_admin = capable(CAP_SYS_ADMIN); | ||
2666 | |||
2667 | if (!(file->f_mode & FMODE_READ)) | ||
2668 | return -EINVAL; | ||
2669 | |||
2670 | ret = mnt_want_write_file(file); | ||
2671 | if (ret) | ||
2672 | return ret; | ||
2673 | |||
2674 | if (copy_from_user(&same, | ||
2675 | (struct btrfs_ioctl_same_args __user *)argp, | ||
2676 | sizeof(same))) { | ||
2677 | ret = -EFAULT; | ||
2678 | goto out; | ||
2679 | } | ||
2680 | |||
2681 | off = same.logical_offset; | ||
2682 | len = same.length; | ||
2683 | |||
2684 | /* | ||
2685 | * Limit the total length we will dedupe for each operation. | ||
2686 | * This is intended to bound the total time spent in this | ||
2687 | * ioctl to something sane. | ||
2688 | */ | ||
2689 | if (len > BTRFS_MAX_DEDUPE_LEN) | ||
2690 | len = BTRFS_MAX_DEDUPE_LEN; | ||
2691 | |||
2692 | if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { | ||
2693 | /* | ||
2694 | * Btrfs does not support blocksize < page_size. As a | ||
2695 | * result, btrfs_cmp_data() won't correctly handle | ||
2696 | * this situation without an update. | ||
2697 | */ | ||
2698 | ret = -EINVAL; | ||
2699 | goto out; | ||
2700 | } | ||
2701 | |||
2702 | ret = -EISDIR; | ||
2703 | if (S_ISDIR(src->i_mode)) | ||
2704 | goto out; | ||
2705 | |||
2706 | ret = -EACCES; | ||
2707 | if (!S_ISREG(src->i_mode)) | ||
2708 | goto out; | ||
2709 | |||
2710 | ret = 0; | ||
2711 | for (i = 0; i < same.dest_count; i++) { | ||
2712 | if (copy_from_user(&info, &args->info[i], sizeof(info))) { | ||
2713 | ret = -EFAULT; | ||
2714 | goto out; | ||
2715 | } | ||
2716 | |||
2717 | info.bytes_deduped = 0; | ||
2718 | |||
2719 | dst_file = fget(info.fd); | ||
2720 | if (!dst_file) { | ||
2721 | info.status = -EBADF; | ||
2722 | goto next; | ||
2723 | } | ||
2724 | |||
2725 | if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { | ||
2726 | info.status = -EINVAL; | ||
2727 | goto next; | ||
2728 | } | ||
2729 | |||
2730 | info.status = -EXDEV; | ||
2731 | if (file->f_path.mnt != dst_file->f_path.mnt) | ||
2732 | goto next; | ||
2733 | |||
2734 | dst = dst_file->f_dentry->d_inode; | ||
2735 | if (src->i_sb != dst->i_sb) | ||
2736 | goto next; | ||
2737 | |||
2738 | if (S_ISDIR(dst->i_mode)) { | ||
2739 | info.status = -EISDIR; | ||
2740 | goto next; | ||
2741 | } | ||
2742 | |||
2743 | if (!S_ISREG(dst->i_mode)) { | ||
2744 | info.status = -EACCES; | ||
2745 | goto next; | ||
2746 | } | ||
2747 | |||
2748 | info.status = btrfs_extent_same(src, off, len, dst, | ||
2749 | info.logical_offset); | ||
2750 | if (info.status == 0) | ||
2751 | info.bytes_deduped += len; | ||
2752 | |||
2753 | next: | ||
2754 | if (dst_file) | ||
2755 | fput(dst_file); | ||
2756 | |||
2757 | if (__put_user_unaligned(info.status, &args->info[i].status) || | ||
2758 | __put_user_unaligned(info.bytes_deduped, | ||
2759 | &args->info[i].bytes_deduped)) { | ||
2760 | ret = -EFAULT; | ||
2761 | goto out; | ||
2762 | } | ||
2763 | } | ||
2764 | |||
2765 | out: | ||
2766 | mnt_drop_write_file(file); | ||
2767 | return ret; | ||
2768 | } | ||
2769 | |||
2493 | /** | 2770 | /** |
2494 | * btrfs_clone() - clone a range from inode file to another | 2771 | * btrfs_clone() - clone a range from inode file to another |
2495 | * | 2772 | * |
@@ -4242,6 +4519,8 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
4242 | return btrfs_ioctl_get_fslabel(file, argp); | 4519 | return btrfs_ioctl_get_fslabel(file, argp); |
4243 | case BTRFS_IOC_SET_FSLABEL: | 4520 | case BTRFS_IOC_SET_FSLABEL: |
4244 | return btrfs_ioctl_set_fslabel(file, argp); | 4521 | return btrfs_ioctl_set_fslabel(file, argp); |
4522 | case BTRFS_IOC_FILE_EXTENT_SAME: | ||
4523 | return btrfs_ioctl_file_extent_same(file, argp); | ||
4245 | } | 4524 | } |
4246 | 4525 | ||
4247 | return -ENOTTY; | 4526 | return -ENOTTY; |