aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/ioctl.c
diff options
context:
space:
mode:
author: Mark Fasheh <mfasheh@suse.de> 2013-08-06 14:42:51 -0400
committer: Chris Mason <chris.mason@fusionio.com> 2013-09-01 08:05:00 -0400
commit: 416161db9b63e353a8fb79d1369779175102fca1 (patch)
tree: 60627f898b85d3173e83222d12fc9f677e84730e /fs/btrfs/ioctl.c
parent: 4b384318a74e38eb248f74f9a92a700d2ce841f1 (diff)
btrfs: offline dedupe
This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME which will try to de-duplicate a list of extents across a range of files. Internally, the ioctl re-uses code from the clone ioctl. This avoids rewriting a large chunk of extent handling code. Userspace passes in an array of file, offset pairs along with a length argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison of the user data before deduping the extent. Status and number of bytes deduped are returned for each operation. Signed-off-by: Mark Fasheh <mfasheh@suse.de> Reviewed-by: Zach Brown <zab@redhat.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r--fs/btrfs/ioctl.c279
1 files changed, 279 insertions, 0 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5b5148a1b0d3..022d8364e072 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,7 @@
43#include <linux/blkdev.h> 43#include <linux/blkdev.h>
44#include <linux/uuid.h> 44#include <linux/uuid.h>
45#include <linux/btrfs.h> 45#include <linux/btrfs.h>
46#include <linux/uaccess.h>
46#include "compat.h" 47#include "compat.h"
47#include "ctree.h" 48#include "ctree.h"
48#include "disk-io.h" 49#include "disk-io.h"
@@ -57,6 +58,9 @@
57#include "send.h" 58#include "send.h"
58#include "dev-replace.h" 59#include "dev-replace.h"
59 60
61static int btrfs_clone(struct inode *src, struct inode *inode,
62 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
63
60/* Mask out flags that are inappropriate for the given type of inode. */ 64/* Mask out flags that are inappropriate for the given type of inode. */
61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 65static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
62{ 66{
@@ -2470,6 +2474,34 @@ out:
2470 return ret; 2474 return ret;
2471} 2475}
2472 2476
2477static struct page *extent_same_get_page(struct inode *inode, u64 off)
2478{
2479 struct page *page;
2480 pgoff_t index;
2481 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2482
2483 index = off >> PAGE_CACHE_SHIFT;
2484
2485 page = grab_cache_page(inode->i_mapping, index);
2486 if (!page)
2487 return NULL;
2488
2489 if (!PageUptodate(page)) {
2490 if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
2491 0))
2492 return NULL;
2493 lock_page(page);
2494 if (!PageUptodate(page)) {
2495 unlock_page(page);
2496 page_cache_release(page);
2497 return NULL;
2498 }
2499 }
2500 unlock_page(page);
2501
2502 return page;
2503}
2504
2473static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) 2505static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
2474{ 2506{
2475 /* do any pending delalloc/csum calc on src, one way or 2507 /* do any pending delalloc/csum calc on src, one way or
@@ -2490,6 +2522,251 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
2490 } 2522 }
2491} 2523}
2492 2524
2525static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
2526 struct inode *inode2, u64 loff2, u64 len)
2527{
2528 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
2529 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
2530
2531 mutex_unlock(&inode1->i_mutex);
2532 mutex_unlock(&inode2->i_mutex);
2533}
2534
2535static void btrfs_double_lock(struct inode *inode1, u64 loff1,
2536 struct inode *inode2, u64 loff2, u64 len)
2537{
2538 if (inode1 < inode2) {
2539 swap(inode1, inode2);
2540 swap(loff1, loff2);
2541 }
2542
2543 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
2544 lock_extent_range(inode1, loff1, len);
2545 if (inode1 != inode2) {
2546 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
2547 lock_extent_range(inode2, loff2, len);
2548 }
2549}
2550
2551static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
2552 u64 dst_loff, u64 len)
2553{
2554 int ret = 0;
2555 struct page *src_page, *dst_page;
2556 unsigned int cmp_len = PAGE_CACHE_SIZE;
2557 void *addr, *dst_addr;
2558
2559 while (len) {
2560 if (len < PAGE_CACHE_SIZE)
2561 cmp_len = len;
2562
2563 src_page = extent_same_get_page(src, loff);
2564 if (!src_page)
2565 return -EINVAL;
2566 dst_page = extent_same_get_page(dst, dst_loff);
2567 if (!dst_page) {
2568 page_cache_release(src_page);
2569 return -EINVAL;
2570 }
2571 addr = kmap_atomic(src_page);
2572 dst_addr = kmap_atomic(dst_page);
2573
2574 flush_dcache_page(src_page);
2575 flush_dcache_page(dst_page);
2576
2577 if (memcmp(addr, dst_addr, cmp_len))
2578 ret = BTRFS_SAME_DATA_DIFFERS;
2579
2580 kunmap_atomic(addr);
2581 kunmap_atomic(dst_addr);
2582 page_cache_release(src_page);
2583 page_cache_release(dst_page);
2584
2585 if (ret)
2586 break;
2587
2588 loff += cmp_len;
2589 dst_loff += cmp_len;
2590 len -= cmp_len;
2591 }
2592
2593 return ret;
2594}
2595
2596static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
2597{
2598 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
2599
2600 if (off + len > inode->i_size || off + len < off)
2601 return -EINVAL;
2602 /* Check that we are block aligned - btrfs_clone() requires this */
2603 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
2604 return -EINVAL;
2605
2606 return 0;
2607}
2608
2609static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
2610 struct inode *dst, u64 dst_loff)
2611{
2612 int ret;
2613
2614 /*
2615 * btrfs_clone() can't handle extents in the same file
2616 * yet. Once that works, we can drop this check and replace it
2617 * with a check for the same inode, but overlapping extents.
2618 */
2619 if (src == dst)
2620 return -EINVAL;
2621
2622 btrfs_double_lock(src, loff, dst, dst_loff, len);
2623
2624 ret = extent_same_check_offsets(src, loff, len);
2625 if (ret)
2626 goto out_unlock;
2627
2628 ret = extent_same_check_offsets(dst, dst_loff, len);
2629 if (ret)
2630 goto out_unlock;
2631
2632 /* don't make the dst file partly checksummed */
2633 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
2634 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
2635 ret = -EINVAL;
2636 goto out_unlock;
2637 }
2638
2639 ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
2640 if (ret == 0)
2641 ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
2642
2643out_unlock:
2644 btrfs_double_unlock(src, loff, dst, dst_loff, len);
2645
2646 return ret;
2647}
2648
2649#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
2650
2651static long btrfs_ioctl_file_extent_same(struct file *file,
2652 void __user *argp)
2653{
2654 struct btrfs_ioctl_same_args *args = argp;
2655 struct btrfs_ioctl_same_args same;
2656 struct btrfs_ioctl_same_extent_info info;
2657 struct inode *src = file->f_dentry->d_inode;
2658 struct file *dst_file = NULL;
2659 struct inode *dst;
2660 u64 off;
2661 u64 len;
2662 int i;
2663 int ret;
2664 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
2665 bool is_admin = capable(CAP_SYS_ADMIN);
2666
2667 if (!(file->f_mode & FMODE_READ))
2668 return -EINVAL;
2669
2670 ret = mnt_want_write_file(file);
2671 if (ret)
2672 return ret;
2673
2674 if (copy_from_user(&same,
2675 (struct btrfs_ioctl_same_args __user *)argp,
2676 sizeof(same))) {
2677 ret = -EFAULT;
2678 goto out;
2679 }
2680
2681 off = same.logical_offset;
2682 len = same.length;
2683
2684 /*
2685 * Limit the total length we will dedupe for each operation.
2686 * This is intended to bound the total time spent in this
2687 * ioctl to something sane.
2688 */
2689 if (len > BTRFS_MAX_DEDUPE_LEN)
2690 len = BTRFS_MAX_DEDUPE_LEN;
2691
2692 if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
2693 /*
2694 * Btrfs does not support blocksize < page_size. As a
2695 * result, btrfs_cmp_data() won't correctly handle
2696 * this situation without an update.
2697 */
2698 ret = -EINVAL;
2699 goto out;
2700 }
2701
2702 ret = -EISDIR;
2703 if (S_ISDIR(src->i_mode))
2704 goto out;
2705
2706 ret = -EACCES;
2707 if (!S_ISREG(src->i_mode))
2708 goto out;
2709
2710 ret = 0;
2711 for (i = 0; i < same.dest_count; i++) {
2712 if (copy_from_user(&info, &args->info[i], sizeof(info))) {
2713 ret = -EFAULT;
2714 goto out;
2715 }
2716
2717 info.bytes_deduped = 0;
2718
2719 dst_file = fget(info.fd);
2720 if (!dst_file) {
2721 info.status = -EBADF;
2722 goto next;
2723 }
2724
2725 if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
2726 info.status = -EINVAL;
2727 goto next;
2728 }
2729
2730 info.status = -EXDEV;
2731 if (file->f_path.mnt != dst_file->f_path.mnt)
2732 goto next;
2733
2734 dst = dst_file->f_dentry->d_inode;
2735 if (src->i_sb != dst->i_sb)
2736 goto next;
2737
2738 if (S_ISDIR(dst->i_mode)) {
2739 info.status = -EISDIR;
2740 goto next;
2741 }
2742
2743 if (!S_ISREG(dst->i_mode)) {
2744 info.status = -EACCES;
2745 goto next;
2746 }
2747
2748 info.status = btrfs_extent_same(src, off, len, dst,
2749 info.logical_offset);
2750 if (info.status == 0)
2751 info.bytes_deduped += len;
2752
2753next:
2754 if (dst_file)
2755 fput(dst_file);
2756
2757 if (__put_user_unaligned(info.status, &args->info[i].status) ||
2758 __put_user_unaligned(info.bytes_deduped,
2759 &args->info[i].bytes_deduped)) {
2760 ret = -EFAULT;
2761 goto out;
2762 }
2763 }
2764
2765out:
2766 mnt_drop_write_file(file);
2767 return ret;
2768}
2769
2493/** 2770/**
2494 * btrfs_clone() - clone a range from inode file to another 2771 * btrfs_clone() - clone a range from inode file to another
2495 * 2772 *
@@ -4242,6 +4519,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4242 return btrfs_ioctl_get_fslabel(file, argp); 4519 return btrfs_ioctl_get_fslabel(file, argp);
4243 case BTRFS_IOC_SET_FSLABEL: 4520 case BTRFS_IOC_SET_FSLABEL:
4244 return btrfs_ioctl_set_fslabel(file, argp); 4521 return btrfs_ioctl_set_fslabel(file, argp);
4522 case BTRFS_IOC_FILE_EXTENT_SAME:
4523 return btrfs_ioctl_file_extent_same(file, argp);
4245 } 4524 }
4246 4525
4247 return -ENOTTY; 4526 return -ENOTTY;