aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/btrfs/ioctl.c624
1 files changed, 133 insertions, 491 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 410c7e007ba8..fab9443f6a42 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3206,92 +3206,6 @@ out:
3206 return ret; 3206 return ret;
3207} 3207}
3208 3208
3209static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
3210{
3211 struct page *page;
3212
3213 page = grab_cache_page(inode->i_mapping, index);
3214 if (!page)
3215 return ERR_PTR(-ENOMEM);
3216
3217 if (!PageUptodate(page)) {
3218 int ret;
3219
3220 ret = btrfs_readpage(NULL, page);
3221 if (ret)
3222 return ERR_PTR(ret);
3223 lock_page(page);
3224 if (!PageUptodate(page)) {
3225 unlock_page(page);
3226 put_page(page);
3227 return ERR_PTR(-EIO);
3228 }
3229 if (page->mapping != inode->i_mapping) {
3230 unlock_page(page);
3231 put_page(page);
3232 return ERR_PTR(-EAGAIN);
3233 }
3234 }
3235
3236 return page;
3237}
3238
3239static int gather_extent_pages(struct inode *inode, struct page **pages,
3240 int num_pages, u64 off)
3241{
3242 int i;
3243 pgoff_t index = off >> PAGE_SHIFT;
3244
3245 for (i = 0; i < num_pages; i++) {
3246again:
3247 pages[i] = extent_same_get_page(inode, index + i);
3248 if (IS_ERR(pages[i])) {
3249 int err = PTR_ERR(pages[i]);
3250
3251 if (err == -EAGAIN)
3252 goto again;
3253 pages[i] = NULL;
3254 return err;
3255 }
3256 }
3257 return 0;
3258}
3259
3260static int lock_extent_range(struct inode *inode, u64 off, u64 len,
3261 bool retry_range_locking)
3262{
3263 /*
3264 * Do any pending delalloc/csum calculations on inode, one way or
3265 * another, and lock file content.
3266 * The locking order is:
3267 *
3268 * 1) pages
3269 * 2) range in the inode's io tree
3270 */
3271 while (1) {
3272 struct btrfs_ordered_extent *ordered;
3273 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
3274 ordered = btrfs_lookup_first_ordered_extent(inode,
3275 off + len - 1);
3276 if ((!ordered ||
3277 ordered->file_offset + ordered->len <= off ||
3278 ordered->file_offset >= off + len) &&
3279 !test_range_bit(&BTRFS_I(inode)->io_tree, off,
3280 off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
3281 if (ordered)
3282 btrfs_put_ordered_extent(ordered);
3283 break;
3284 }
3285 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
3286 if (ordered)
3287 btrfs_put_ordered_extent(ordered);
3288 if (!retry_range_locking)
3289 return -EAGAIN;
3290 btrfs_wait_ordered_range(inode, off, len);
3291 }
3292 return 0;
3293}
3294
3295static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) 3209static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
3296{ 3210{
3297 inode_unlock(inode1); 3211 inode_unlock(inode1);
@@ -3307,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
3307 inode_lock_nested(inode2, I_MUTEX_CHILD); 3221 inode_lock_nested(inode2, I_MUTEX_CHILD);
3308} 3222}
3309 3223
3310static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
3311 struct inode *inode2, u64 loff2, u64 len)
3312{
3313 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
3314 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
3315}
3316
3317static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
3318 struct inode *inode2, u64 loff2, u64 len,
3319 bool retry_range_locking)
3320{
3321 int ret;
3322
3323 if (inode1 < inode2) {
3324 swap(inode1, inode2);
3325 swap(loff1, loff2);
3326 }
3327 ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
3328 if (ret)
3329 return ret;
3330 ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
3331 if (ret)
3332 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
3333 loff1 + len - 1);
3334 return ret;
3335}
3336
3337struct cmp_pages {
3338 int num_pages;
3339 struct page **src_pages;
3340 struct page **dst_pages;
3341};
3342
3343static void btrfs_cmp_data_free(struct cmp_pages *cmp)
3344{
3345 int i;
3346 struct page *pg;
3347
3348 for (i = 0; i < cmp->num_pages; i++) {
3349 pg = cmp->src_pages[i];
3350 if (pg) {
3351 unlock_page(pg);
3352 put_page(pg);
3353 cmp->src_pages[i] = NULL;
3354 }
3355 pg = cmp->dst_pages[i];
3356 if (pg) {
3357 unlock_page(pg);
3358 put_page(pg);
3359 cmp->dst_pages[i] = NULL;
3360 }
3361 }
3362}
3363
3364static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
3365 struct inode *dst, u64 dst_loff,
3366 u64 len, struct cmp_pages *cmp)
3367{
3368 int ret;
3369 int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
3370
3371 cmp->num_pages = num_pages;
3372
3373 ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
3374 if (ret)
3375 goto out;
3376
3377 ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
3378
3379out:
3380 if (ret)
3381 btrfs_cmp_data_free(cmp);
3382 return ret;
3383}
3384
3385static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
3386{
3387 int ret = 0;
3388 int i;
3389 struct page *src_page, *dst_page;
3390 unsigned int cmp_len = PAGE_SIZE;
3391 void *addr, *dst_addr;
3392
3393 i = 0;
3394 while (len) {
3395 if (len < PAGE_SIZE)
3396 cmp_len = len;
3397
3398 BUG_ON(i >= cmp->num_pages);
3399
3400 src_page = cmp->src_pages[i];
3401 dst_page = cmp->dst_pages[i];
3402 ASSERT(PageLocked(src_page));
3403 ASSERT(PageLocked(dst_page));
3404
3405 addr = kmap_atomic(src_page);
3406 dst_addr = kmap_atomic(dst_page);
3407
3408 flush_dcache_page(src_page);
3409 flush_dcache_page(dst_page);
3410
3411 if (memcmp(addr, dst_addr, cmp_len))
3412 ret = -EBADE;
3413
3414 kunmap_atomic(addr);
3415 kunmap_atomic(dst_addr);
3416
3417 if (ret)
3418 break;
3419
3420 len -= cmp_len;
3421 i++;
3422 }
3423
3424 return ret;
3425}
3426
3427static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
3428 u64 olen)
3429{
3430 u64 len = *plen;
3431 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
3432
3433 if (off + olen > inode->i_size || off + olen < off)
3434 return -EINVAL;
3435
3436 /* if we extend to eof, continue to block boundary */
3437 if (off + len == inode->i_size)
3438 *plen = len = ALIGN(inode->i_size, bs) - off;
3439
3440 /* Check that we are block aligned - btrfs_clone() requires this */
3441 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
3442 return -EINVAL;
3443
3444 return 0;
3445}
3446
3447static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, 3224static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
3448 struct inode *dst, u64 dst_loff, 3225 struct inode *dst, u64 dst_loff)
3449 struct cmp_pages *cmp)
3450{ 3226{
3227 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
3451 int ret; 3228 int ret;
3452 u64 len = olen; 3229 u64 len = olen;
3453 bool same_inode = (src == dst);
3454 u64 same_lock_start = 0;
3455 u64 same_lock_len = 0;
3456
3457 ret = extent_same_check_offsets(src, loff, &len, olen);
3458 if (ret)
3459 return ret;
3460
3461 ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
3462 if (ret)
3463 return ret;
3464
3465 if (same_inode) {
3466 /*
3467 * Single inode case wants the same checks, except we
3468 * don't want our length pushed out past i_size as
3469 * comparing that data range makes no sense.
3470 *
3471 * extent_same_check_offsets() will do this for an
3472 * unaligned length at i_size, so catch it here and
3473 * reject the request.
3474 *
3475 * This effectively means we require aligned extents
3476 * for the single-inode case, whereas the other cases
3477 * allow an unaligned length so long as it ends at
3478 * i_size.
3479 */
3480 if (len != olen)
3481 return -EINVAL;
3482 3230
3483 /* Check for overlapping ranges */ 3231 if (loff + len == src->i_size)
3484 if (dst_loff + len > loff && dst_loff < loff + len) 3232 len = ALIGN(src->i_size, bs) - loff;
3485 return -EINVAL;
3486
3487 same_lock_start = min_t(u64, loff, dst_loff);
3488 same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
3489 } else {
3490 /*
3491 * If the source and destination inodes are different, the
3492 * source's range end offset matches the source's i_size, that
3493 * i_size is not a multiple of the sector size, and the
3494 * destination range does not go past the destination's i_size,
3495 * we must round down the length to the nearest sector size
3496 * multiple. If we don't do this adjustment we end replacing
3497 * with zeroes the bytes in the range that starts at the
3498 * deduplication range's end offset and ends at the next sector
3499 * size multiple.
3500 */
3501 if (loff + olen == i_size_read(src) &&
3502 dst_loff + len < i_size_read(dst)) {
3503 const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
3504
3505 len = round_down(i_size_read(src), sz) - loff;
3506 if (len == 0)
3507 return 0;
3508 olen = len;
3509 }
3510 }
3511
3512again:
3513 ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
3514 if (ret)
3515 return ret;
3516
3517 if (same_inode)
3518 ret = lock_extent_range(src, same_lock_start, same_lock_len,
3519 false);
3520 else
3521 ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
3522 false);
3523 /* 3233 /*
3524 * If one of the inodes has dirty pages in the respective range or 3234 * For same inode case we don't want our length pushed out past i_size
3525 * ordered extents, we need to flush dellaloc and wait for all ordered 3235 * as comparing that data range makes no sense.
3526 * extents in the range. We must unlock the pages and the ranges in the 3236 *
3527 * io trees to avoid deadlocks when flushing delalloc (requires locking 3237 * This effectively means we require aligned extents for the single
3528 * pages) and when waiting for ordered extents to complete (they require 3238 * inode case, whereas the other cases allow an unaligned length so long
3529 * range locking). 3239 * as it ends at i_size.
3530 */ 3240 */
3531 if (ret == -EAGAIN) { 3241 if (dst == src && len != olen)
3532 /* 3242 return -EINVAL;
3533 * Ranges in the io trees already unlocked. Now unlock all
3534 * pages before waiting for all IO to complete.
3535 */
3536 btrfs_cmp_data_free(cmp);
3537 if (same_inode) {
3538 btrfs_wait_ordered_range(src, same_lock_start,
3539 same_lock_len);
3540 } else {
3541 btrfs_wait_ordered_range(src, loff, len);
3542 btrfs_wait_ordered_range(dst, dst_loff, len);
3543 }
3544 goto again;
3545 }
3546 ASSERT(ret == 0);
3547 if (WARN_ON(ret)) {
3548 /* ranges in the io trees already unlocked */
3549 btrfs_cmp_data_free(cmp);
3550 return ret;
3551 }
3552
3553 /* pass original length for comparison so we stay within i_size */
3554 ret = btrfs_cmp_data(olen, cmp);
3555 if (ret == 0)
3556 ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
3557
3558 if (same_inode)
3559 unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
3560 same_lock_start + same_lock_len - 1);
3561 else
3562 btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
3563 3243
3564 btrfs_cmp_data_free(cmp); 3244 /*
3245 * Lock destination range to serialize with concurrent readpages().
3246 */
3247 lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
3248 ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
3249 unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
3565 3250
3566 return ret; 3251 return ret;
3567} 3252}
@@ -3572,63 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3572 struct inode *dst, u64 dst_loff) 3257 struct inode *dst, u64 dst_loff)
3573{ 3258{
3574 int ret; 3259 int ret;
3575 struct cmp_pages cmp;
3576 int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; 3260 int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
3577 bool same_inode = (src == dst);
3578 u64 i, tail_len, chunk_count; 3261 u64 i, tail_len, chunk_count;
3579 3262
3580 if (olen == 0)
3581 return 0;
3582
3583 if (same_inode)
3584 inode_lock(src);
3585 else
3586 btrfs_double_inode_lock(src, dst);
3587
3588 /* don't make the dst file partly checksummed */ 3263 /* don't make the dst file partly checksummed */
3589 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 3264 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
3590 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { 3265 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
3591 ret = -EINVAL; 3266 return -EINVAL;
3592 goto out_unlock;
3593 }
3594 3267
3595 if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) { 3268 if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
3596 ret = -ETXTBSY; 3269 return -ETXTBSY;
3597 goto out_unlock;
3598 }
3599 3270
3600 tail_len = olen % BTRFS_MAX_DEDUPE_LEN; 3271 tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
3601 chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); 3272 chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
3602 if (chunk_count == 0) 3273 if (chunk_count == 0)
3603 num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; 3274 num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
3604 3275
3605 /*
3606 * If deduping ranges in the same inode, locking rules make it
3607 * mandatory to always lock pages in ascending order to avoid deadlocks
3608 * with concurrent tasks (such as starting writeback/delalloc).
3609 */
3610 if (same_inode && dst_loff < loff)
3611 swap(loff, dst_loff);
3612
3613 /*
3614 * We must gather up all the pages before we initiate our extent
3615 * locking. We use an array for the page pointers. Size of the array is
3616 * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
3617 */
3618 cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
3619 GFP_KERNEL | __GFP_ZERO);
3620 cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
3621 GFP_KERNEL | __GFP_ZERO);
3622 if (!cmp.src_pages || !cmp.dst_pages) {
3623 ret = -ENOMEM;
3624 goto out_free;
3625 }
3626
3627 for (i = 0; i < chunk_count; i++) { 3276 for (i = 0; i < chunk_count; i++) {
3628 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, 3277 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
3629 dst, dst_loff, &cmp); 3278 dst, dst_loff);
3630 if (ret) 3279 if (ret)
3631 goto out_free; 3280 return ret;
3632 3281
3633 loff += BTRFS_MAX_DEDUPE_LEN; 3282 loff += BTRFS_MAX_DEDUPE_LEN;
3634 dst_loff += BTRFS_MAX_DEDUPE_LEN; 3283 dst_loff += BTRFS_MAX_DEDUPE_LEN;
@@ -3636,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3636 3285
3637 if (tail_len > 0) 3286 if (tail_len > 0)
3638 ret = btrfs_extent_same_range(src, loff, tail_len, dst, 3287 ret = btrfs_extent_same_range(src, loff, tail_len, dst,
3639 dst_loff, &cmp); 3288 dst_loff);
3640
3641out_free:
3642 kvfree(cmp.src_pages);
3643 kvfree(cmp.dst_pages);
3644
3645out_unlock:
3646 if (same_inode)
3647 inode_unlock(src);
3648 else
3649 btrfs_double_inode_unlock(src, dst);
3650 3289
3651 return ret; 3290 return ret;
3652} 3291}
@@ -4233,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
4233 struct inode *inode = file_inode(file); 3872 struct inode *inode = file_inode(file);
4234 struct inode *src = file_inode(file_src); 3873 struct inode *src = file_inode(file_src);
4235 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3874 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4236 struct btrfs_root *root = BTRFS_I(inode)->root;
4237 int ret; 3875 int ret;
4238 u64 len = olen; 3876 u64 len = olen;
4239 u64 bs = fs_info->sb->s_blocksize; 3877 u64 bs = fs_info->sb->s_blocksize;
4240 int same_inode = src == inode;
4241 3878
4242 /* 3879 /*
4243 * TODO: 3880 * TODO:
@@ -4250,106 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
4250 * be either compressed or non-compressed. 3887 * be either compressed or non-compressed.
4251 */ 3888 */
4252 3889
4253 if (btrfs_root_readonly(root))
4254 return -EROFS;
4255
4256 if (file_src->f_path.mnt != file->f_path.mnt ||
4257 src->i_sb != inode->i_sb)
4258 return -EXDEV;
4259
4260 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
4261 return -EISDIR;
4262
4263 if (!same_inode) {
4264 btrfs_double_inode_lock(src, inode);
4265 } else {
4266 inode_lock(src);
4267 }
4268
4269 /* don't make the dst file partly checksummed */ 3890 /* don't make the dst file partly checksummed */
4270 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 3891 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
4271 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 3892 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
4272 ret = -EINVAL; 3893 return -EINVAL;
4273 goto out_unlock;
4274 }
4275 3894
4276 if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) { 3895 if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
4277 ret = -ETXTBSY; 3896 return -ETXTBSY;
4278 goto out_unlock;
4279 }
4280 3897
4281 /* determine range to clone */
4282 ret = -EINVAL;
4283 if (off + len > src->i_size || off + len < off)
4284 goto out_unlock;
4285 if (len == 0)
4286 olen = len = src->i_size - off;
4287 /* 3898 /*
4288 * If we extend to eof, continue to block boundary if and only if the 3899 * VFS's generic_remap_file_range_prep() protects us from cloning the
4289 * destination end offset matches the destination file's size, otherwise 3900 * eof block into the middle of a file, which would result in corruption
4290 * we would be corrupting data by placing the eof block into the middle 3901 * if the file size is not blocksize aligned. So we don't need to check
4291 * of a file. 3902 * for that case here.
4292 */ 3903 */
4293 if (off + len == src->i_size) { 3904 if (off + len == src->i_size)
4294 if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
4295 goto out_unlock;
4296 len = ALIGN(src->i_size, bs) - off; 3905 len = ALIGN(src->i_size, bs) - off;
4297 }
4298
4299 if (len == 0) {
4300 ret = 0;
4301 goto out_unlock;
4302 }
4303
4304 /* verify the end result is block aligned */
4305 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
4306 !IS_ALIGNED(destoff, bs))
4307 goto out_unlock;
4308
4309 /* verify if ranges are overlapped within the same file */
4310 if (same_inode) {
4311 if (destoff + len > off && destoff < off + len)
4312 goto out_unlock;
4313 }
4314 3906
4315 if (destoff > inode->i_size) { 3907 if (destoff > inode->i_size) {
4316 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 3908 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
4317 if (ret) 3909 if (ret)
4318 goto out_unlock; 3910 return ret;
4319 } 3911 }
4320 3912
4321 /* 3913 /*
4322 * Lock the target range too. Right after we replace the file extent 3914 * Lock destination range to serialize with concurrent readpages().
4323 * items in the fs tree (which now point to the cloned data), we might
4324 * have a worker replace them with extent items relative to a write
4325 * operation that was issued before this clone operation (i.e. confront
4326 * with inode.c:btrfs_finish_ordered_io).
4327 */ 3915 */
4328 if (same_inode) { 3916 lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
4329 u64 lock_start = min_t(u64, off, destoff);
4330 u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
4331
4332 ret = lock_extent_range(src, lock_start, lock_len, true);
4333 } else {
4334 ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
4335 true);
4336 }
4337 ASSERT(ret == 0);
4338 if (WARN_ON(ret)) {
4339 /* ranges in the io trees already unlocked */
4340 goto out_unlock;
4341 }
4342
4343 ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); 3917 ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
4344 3918 unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
4345 if (same_inode) {
4346 u64 lock_start = min_t(u64, off, destoff);
4347 u64 lock_end = max_t(u64, off, destoff) + len - 1;
4348
4349 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
4350 } else {
4351 btrfs_double_extent_unlock(src, off, inode, destoff, len);
4352 }
4353 /* 3919 /*
4354 * Truncate page cache pages so that future reads will see the cloned 3920 * Truncate page cache pages so that future reads will see the cloned
4355 * data immediately and not the previous data. 3921 * data immediately and not the previous data.
@@ -4357,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
4357 truncate_inode_pages_range(&inode->i_data, 3923 truncate_inode_pages_range(&inode->i_data,
4358 round_down(destoff, PAGE_SIZE), 3924 round_down(destoff, PAGE_SIZE),
4359 round_up(destoff + len, PAGE_SIZE) - 1); 3925 round_up(destoff + len, PAGE_SIZE) - 1);
4360out_unlock: 3926
3927 return ret;
3928}
3929
3930static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
3931 struct file *file_out, loff_t pos_out,
3932 loff_t *len, unsigned int remap_flags)
3933{
3934 struct inode *inode_in = file_inode(file_in);
3935 struct inode *inode_out = file_inode(file_out);
3936 u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
3937 bool same_inode = inode_out == inode_in;
3938 u64 wb_len;
3939 int ret;
3940
3941 if (!(remap_flags & REMAP_FILE_DEDUP)) {
3942 struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
3943
3944 if (btrfs_root_readonly(root_out))
3945 return -EROFS;
3946
3947 if (file_in->f_path.mnt != file_out->f_path.mnt ||
3948 inode_in->i_sb != inode_out->i_sb)
3949 return -EXDEV;
3950 }
3951
3952 if (same_inode)
3953 inode_lock(inode_in);
3954 else
3955 btrfs_double_inode_lock(inode_in, inode_out);
3956
3957 /*
3958 * Now that the inodes are locked, we need to start writeback ourselves
3959 * and can not rely on the writeback from the VFS's generic helper
3960 * generic_remap_file_range_prep() because:
3961 *
3962 * 1) For compression we must call filemap_fdatawrite_range() range
3963 * twice (btrfs_fdatawrite_range() does it for us), and the generic
3964 * helper only calls it once;
3965 *
3966 * 2) filemap_fdatawrite_range(), called by the generic helper only
3967 * waits for the writeback to complete, i.e. for IO to be done, and
3968 * not for the ordered extents to complete. We need to wait for them
3969 * to complete so that new file extent items are in the fs tree.
3970 */
3971 if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
3972 wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
3973 else
3974 wb_len = ALIGN(*len, bs);
3975
3976 /*
3977 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
3978 * any in progress could create its ordered extents after we wait for
3979 * existing ordered extents below).
3980 */
3981 inode_dio_wait(inode_in);
4361 if (!same_inode) 3982 if (!same_inode)
4362 btrfs_double_inode_unlock(src, inode); 3983 inode_dio_wait(inode_out);
3984
3985 ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
3986 wb_len);
3987 if (ret < 0)
3988 goto out_unlock;
3989 ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
3990 wb_len);
3991 if (ret < 0)
3992 goto out_unlock;
3993
3994 ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
3995 len, remap_flags);
3996 if (ret < 0 || *len == 0)
3997 goto out_unlock;
3998
3999 return 0;
4000
4001 out_unlock:
4002 if (same_inode)
4003 inode_unlock(inode_in);
4363 else 4004 else
4364 inode_unlock(src); 4005 btrfs_double_inode_unlock(inode_in, inode_out);
4006
4365 return ret; 4007 return ret;
4366} 4008}
4367 4009
@@ -4369,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
4369 struct file *dst_file, loff_t destoff, loff_t len, 4011 struct file *dst_file, loff_t destoff, loff_t len,
4370 unsigned int remap_flags) 4012 unsigned int remap_flags)
4371{ 4013{
4014 struct inode *src_inode = file_inode(src_file);
4015 struct inode *dst_inode = file_inode(dst_file);
4016 bool same_inode = dst_inode == src_inode;
4372 int ret; 4017 int ret;
4373 4018
4374 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) 4019 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
4375 return -EINVAL; 4020 return -EINVAL;
4376 4021
4377 if (remap_flags & REMAP_FILE_DEDUP) { 4022 ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
4378 struct inode *src = file_inode(src_file); 4023 &len, remap_flags);
4379 struct inode *dst = file_inode(dst_file); 4024 if (ret < 0 || len == 0)
4380 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 4025 return ret;
4381
4382 if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
4383 /*
4384 * Btrfs does not support blocksize < page_size. As a
4385 * result, btrfs_cmp_data() won't correctly handle
4386 * this situation without an update.
4387 */
4388 return -EINVAL;
4389 }
4390 4026
4391 ret = btrfs_extent_same(src, off, len, dst, destoff); 4027 if (remap_flags & REMAP_FILE_DEDUP)
4392 } else { 4028 ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
4029 else
4393 ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); 4030 ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
4394 } 4031
4032 if (same_inode)
4033 inode_unlock(src_inode);
4034 else
4035 btrfs_double_inode_unlock(src_inode, dst_inode);
4036
4395 return ret < 0 ? ret : len; 4037 return ret < 0 ? ret : len;
4396} 4038}
4397 4039