diff options
| -rw-r--r-- | fs/btrfs/ioctl.c | 624 |
1 files changed, 133 insertions, 491 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 410c7e007ba8..fab9443f6a42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
| @@ -3206,92 +3206,6 @@ out: | |||
| 3206 | return ret; | 3206 | return ret; |
| 3207 | } | 3207 | } |
| 3208 | 3208 | ||
| 3209 | static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) | ||
| 3210 | { | ||
| 3211 | struct page *page; | ||
| 3212 | |||
| 3213 | page = grab_cache_page(inode->i_mapping, index); | ||
| 3214 | if (!page) | ||
| 3215 | return ERR_PTR(-ENOMEM); | ||
| 3216 | |||
| 3217 | if (!PageUptodate(page)) { | ||
| 3218 | int ret; | ||
| 3219 | |||
| 3220 | ret = btrfs_readpage(NULL, page); | ||
| 3221 | if (ret) | ||
| 3222 | return ERR_PTR(ret); | ||
| 3223 | lock_page(page); | ||
| 3224 | if (!PageUptodate(page)) { | ||
| 3225 | unlock_page(page); | ||
| 3226 | put_page(page); | ||
| 3227 | return ERR_PTR(-EIO); | ||
| 3228 | } | ||
| 3229 | if (page->mapping != inode->i_mapping) { | ||
| 3230 | unlock_page(page); | ||
| 3231 | put_page(page); | ||
| 3232 | return ERR_PTR(-EAGAIN); | ||
| 3233 | } | ||
| 3234 | } | ||
| 3235 | |||
| 3236 | return page; | ||
| 3237 | } | ||
| 3238 | |||
| 3239 | static int gather_extent_pages(struct inode *inode, struct page **pages, | ||
| 3240 | int num_pages, u64 off) | ||
| 3241 | { | ||
| 3242 | int i; | ||
| 3243 | pgoff_t index = off >> PAGE_SHIFT; | ||
| 3244 | |||
| 3245 | for (i = 0; i < num_pages; i++) { | ||
| 3246 | again: | ||
| 3247 | pages[i] = extent_same_get_page(inode, index + i); | ||
| 3248 | if (IS_ERR(pages[i])) { | ||
| 3249 | int err = PTR_ERR(pages[i]); | ||
| 3250 | |||
| 3251 | if (err == -EAGAIN) | ||
| 3252 | goto again; | ||
| 3253 | pages[i] = NULL; | ||
| 3254 | return err; | ||
| 3255 | } | ||
| 3256 | } | ||
| 3257 | return 0; | ||
| 3258 | } | ||
| 3259 | |||
| 3260 | static int lock_extent_range(struct inode *inode, u64 off, u64 len, | ||
| 3261 | bool retry_range_locking) | ||
| 3262 | { | ||
| 3263 | /* | ||
| 3264 | * Do any pending delalloc/csum calculations on inode, one way or | ||
| 3265 | * another, and lock file content. | ||
| 3266 | * The locking order is: | ||
| 3267 | * | ||
| 3268 | * 1) pages | ||
| 3269 | * 2) range in the inode's io tree | ||
| 3270 | */ | ||
| 3271 | while (1) { | ||
| 3272 | struct btrfs_ordered_extent *ordered; | ||
| 3273 | lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); | ||
| 3274 | ordered = btrfs_lookup_first_ordered_extent(inode, | ||
| 3275 | off + len - 1); | ||
| 3276 | if ((!ordered || | ||
| 3277 | ordered->file_offset + ordered->len <= off || | ||
| 3278 | ordered->file_offset >= off + len) && | ||
| 3279 | !test_range_bit(&BTRFS_I(inode)->io_tree, off, | ||
| 3280 | off + len - 1, EXTENT_DELALLOC, 0, NULL)) { | ||
| 3281 | if (ordered) | ||
| 3282 | btrfs_put_ordered_extent(ordered); | ||
| 3283 | break; | ||
| 3284 | } | ||
| 3285 | unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); | ||
| 3286 | if (ordered) | ||
| 3287 | btrfs_put_ordered_extent(ordered); | ||
| 3288 | if (!retry_range_locking) | ||
| 3289 | return -EAGAIN; | ||
| 3290 | btrfs_wait_ordered_range(inode, off, len); | ||
| 3291 | } | ||
| 3292 | return 0; | ||
| 3293 | } | ||
| 3294 | |||
| 3295 | static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) | 3209 | static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) |
| 3296 | { | 3210 | { |
| 3297 | inode_unlock(inode1); | 3211 | inode_unlock(inode1); |
| @@ -3307,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) | |||
| 3307 | inode_lock_nested(inode2, I_MUTEX_CHILD); | 3221 | inode_lock_nested(inode2, I_MUTEX_CHILD); |
| 3308 | } | 3222 | } |
| 3309 | 3223 | ||
| 3310 | static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, | ||
| 3311 | struct inode *inode2, u64 loff2, u64 len) | ||
| 3312 | { | ||
| 3313 | unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); | ||
| 3314 | unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); | ||
| 3315 | } | ||
| 3316 | |||
| 3317 | static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, | ||
| 3318 | struct inode *inode2, u64 loff2, u64 len, | ||
| 3319 | bool retry_range_locking) | ||
| 3320 | { | ||
| 3321 | int ret; | ||
| 3322 | |||
| 3323 | if (inode1 < inode2) { | ||
| 3324 | swap(inode1, inode2); | ||
| 3325 | swap(loff1, loff2); | ||
| 3326 | } | ||
| 3327 | ret = lock_extent_range(inode1, loff1, len, retry_range_locking); | ||
| 3328 | if (ret) | ||
| 3329 | return ret; | ||
| 3330 | ret = lock_extent_range(inode2, loff2, len, retry_range_locking); | ||
| 3331 | if (ret) | ||
| 3332 | unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, | ||
| 3333 | loff1 + len - 1); | ||
| 3334 | return ret; | ||
| 3335 | } | ||
| 3336 | |||
| 3337 | struct cmp_pages { | ||
| 3338 | int num_pages; | ||
| 3339 | struct page **src_pages; | ||
| 3340 | struct page **dst_pages; | ||
| 3341 | }; | ||
| 3342 | |||
| 3343 | static void btrfs_cmp_data_free(struct cmp_pages *cmp) | ||
| 3344 | { | ||
| 3345 | int i; | ||
| 3346 | struct page *pg; | ||
| 3347 | |||
| 3348 | for (i = 0; i < cmp->num_pages; i++) { | ||
| 3349 | pg = cmp->src_pages[i]; | ||
| 3350 | if (pg) { | ||
| 3351 | unlock_page(pg); | ||
| 3352 | put_page(pg); | ||
| 3353 | cmp->src_pages[i] = NULL; | ||
| 3354 | } | ||
| 3355 | pg = cmp->dst_pages[i]; | ||
| 3356 | if (pg) { | ||
| 3357 | unlock_page(pg); | ||
| 3358 | put_page(pg); | ||
| 3359 | cmp->dst_pages[i] = NULL; | ||
| 3360 | } | ||
| 3361 | } | ||
| 3362 | } | ||
| 3363 | |||
| 3364 | static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, | ||
| 3365 | struct inode *dst, u64 dst_loff, | ||
| 3366 | u64 len, struct cmp_pages *cmp) | ||
| 3367 | { | ||
| 3368 | int ret; | ||
| 3369 | int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; | ||
| 3370 | |||
| 3371 | cmp->num_pages = num_pages; | ||
| 3372 | |||
| 3373 | ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); | ||
| 3374 | if (ret) | ||
| 3375 | goto out; | ||
| 3376 | |||
| 3377 | ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); | ||
| 3378 | |||
| 3379 | out: | ||
| 3380 | if (ret) | ||
| 3381 | btrfs_cmp_data_free(cmp); | ||
| 3382 | return ret; | ||
| 3383 | } | ||
| 3384 | |||
| 3385 | static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) | ||
| 3386 | { | ||
| 3387 | int ret = 0; | ||
| 3388 | int i; | ||
| 3389 | struct page *src_page, *dst_page; | ||
| 3390 | unsigned int cmp_len = PAGE_SIZE; | ||
| 3391 | void *addr, *dst_addr; | ||
| 3392 | |||
| 3393 | i = 0; | ||
| 3394 | while (len) { | ||
| 3395 | if (len < PAGE_SIZE) | ||
| 3396 | cmp_len = len; | ||
| 3397 | |||
| 3398 | BUG_ON(i >= cmp->num_pages); | ||
| 3399 | |||
| 3400 | src_page = cmp->src_pages[i]; | ||
| 3401 | dst_page = cmp->dst_pages[i]; | ||
| 3402 | ASSERT(PageLocked(src_page)); | ||
| 3403 | ASSERT(PageLocked(dst_page)); | ||
| 3404 | |||
| 3405 | addr = kmap_atomic(src_page); | ||
| 3406 | dst_addr = kmap_atomic(dst_page); | ||
| 3407 | |||
| 3408 | flush_dcache_page(src_page); | ||
| 3409 | flush_dcache_page(dst_page); | ||
| 3410 | |||
| 3411 | if (memcmp(addr, dst_addr, cmp_len)) | ||
| 3412 | ret = -EBADE; | ||
| 3413 | |||
| 3414 | kunmap_atomic(addr); | ||
| 3415 | kunmap_atomic(dst_addr); | ||
| 3416 | |||
| 3417 | if (ret) | ||
| 3418 | break; | ||
| 3419 | |||
| 3420 | len -= cmp_len; | ||
| 3421 | i++; | ||
| 3422 | } | ||
| 3423 | |||
| 3424 | return ret; | ||
| 3425 | } | ||
| 3426 | |||
| 3427 | static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, | ||
| 3428 | u64 olen) | ||
| 3429 | { | ||
| 3430 | u64 len = *plen; | ||
| 3431 | u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; | ||
| 3432 | |||
| 3433 | if (off + olen > inode->i_size || off + olen < off) | ||
| 3434 | return -EINVAL; | ||
| 3435 | |||
| 3436 | /* if we extend to eof, continue to block boundary */ | ||
| 3437 | if (off + len == inode->i_size) | ||
| 3438 | *plen = len = ALIGN(inode->i_size, bs) - off; | ||
| 3439 | |||
| 3440 | /* Check that we are block aligned - btrfs_clone() requires this */ | ||
| 3441 | if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) | ||
| 3442 | return -EINVAL; | ||
| 3443 | |||
| 3444 | return 0; | ||
| 3445 | } | ||
| 3446 | |||
| 3447 | static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, | 3224 | static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, |
| 3448 | struct inode *dst, u64 dst_loff, | 3225 | struct inode *dst, u64 dst_loff) |
| 3449 | struct cmp_pages *cmp) | ||
| 3450 | { | 3226 | { |
| 3227 | u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; | ||
| 3451 | int ret; | 3228 | int ret; |
| 3452 | u64 len = olen; | 3229 | u64 len = olen; |
| 3453 | bool same_inode = (src == dst); | ||
| 3454 | u64 same_lock_start = 0; | ||
| 3455 | u64 same_lock_len = 0; | ||
| 3456 | |||
| 3457 | ret = extent_same_check_offsets(src, loff, &len, olen); | ||
| 3458 | if (ret) | ||
| 3459 | return ret; | ||
| 3460 | |||
| 3461 | ret = extent_same_check_offsets(dst, dst_loff, &len, olen); | ||
| 3462 | if (ret) | ||
| 3463 | return ret; | ||
| 3464 | |||
| 3465 | if (same_inode) { | ||
| 3466 | /* | ||
| 3467 | * Single inode case wants the same checks, except we | ||
| 3468 | * don't want our length pushed out past i_size as | ||
| 3469 | * comparing that data range makes no sense. | ||
| 3470 | * | ||
| 3471 | * extent_same_check_offsets() will do this for an | ||
| 3472 | * unaligned length at i_size, so catch it here and | ||
| 3473 | * reject the request. | ||
| 3474 | * | ||
| 3475 | * This effectively means we require aligned extents | ||
| 3476 | * for the single-inode case, whereas the other cases | ||
| 3477 | * allow an unaligned length so long as it ends at | ||
| 3478 | * i_size. | ||
| 3479 | */ | ||
| 3480 | if (len != olen) | ||
| 3481 | return -EINVAL; | ||
| 3482 | 3230 | ||
| 3483 | /* Check for overlapping ranges */ | 3231 | if (loff + len == src->i_size) |
| 3484 | if (dst_loff + len > loff && dst_loff < loff + len) | 3232 | len = ALIGN(src->i_size, bs) - loff; |
| 3485 | return -EINVAL; | ||
| 3486 | |||
| 3487 | same_lock_start = min_t(u64, loff, dst_loff); | ||
| 3488 | same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; | ||
| 3489 | } else { | ||
| 3490 | /* | ||
| 3491 | * If the source and destination inodes are different, the | ||
| 3492 | * source's range end offset matches the source's i_size, that | ||
| 3493 | * i_size is not a multiple of the sector size, and the | ||
| 3494 | * destination range does not go past the destination's i_size, | ||
| 3495 | * we must round down the length to the nearest sector size | ||
| 3496 | * multiple. If we don't do this adjustment we end replacing | ||
| 3497 | * with zeroes the bytes in the range that starts at the | ||
| 3498 | * deduplication range's end offset and ends at the next sector | ||
| 3499 | * size multiple. | ||
| 3500 | */ | ||
| 3501 | if (loff + olen == i_size_read(src) && | ||
| 3502 | dst_loff + len < i_size_read(dst)) { | ||
| 3503 | const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; | ||
| 3504 | |||
| 3505 | len = round_down(i_size_read(src), sz) - loff; | ||
| 3506 | if (len == 0) | ||
| 3507 | return 0; | ||
| 3508 | olen = len; | ||
| 3509 | } | ||
| 3510 | } | ||
| 3511 | |||
| 3512 | again: | ||
| 3513 | ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); | ||
| 3514 | if (ret) | ||
| 3515 | return ret; | ||
| 3516 | |||
| 3517 | if (same_inode) | ||
| 3518 | ret = lock_extent_range(src, same_lock_start, same_lock_len, | ||
| 3519 | false); | ||
| 3520 | else | ||
| 3521 | ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, | ||
| 3522 | false); | ||
| 3523 | /* | 3233 | /* |
| 3524 | * If one of the inodes has dirty pages in the respective range or | 3234 | * For same inode case we don't want our length pushed out past i_size |
| 3525 | ordered extents, we need to flush delalloc and wait for all ordered | 3235 | as comparing that data range makes no sense. |
| 3526 | * extents in the range. We must unlock the pages and the ranges in the | 3236 | * |
| 3527 | * io trees to avoid deadlocks when flushing delalloc (requires locking | 3237 | * This effectively means we require aligned extents for the single |
| 3528 | * pages) and when waiting for ordered extents to complete (they require | 3238 | * inode case, whereas the other cases allow an unaligned length so long |
| 3529 | * range locking). | 3239 | * as it ends at i_size. |
| 3530 | */ | 3240 | */ |
| 3531 | if (ret == -EAGAIN) { | 3241 | if (dst == src && len != olen) |
| 3532 | /* | 3242 | return -EINVAL; |
| 3533 | * Ranges in the io trees already unlocked. Now unlock all | ||
| 3534 | * pages before waiting for all IO to complete. | ||
| 3535 | */ | ||
| 3536 | btrfs_cmp_data_free(cmp); | ||
| 3537 | if (same_inode) { | ||
| 3538 | btrfs_wait_ordered_range(src, same_lock_start, | ||
| 3539 | same_lock_len); | ||
| 3540 | } else { | ||
| 3541 | btrfs_wait_ordered_range(src, loff, len); | ||
| 3542 | btrfs_wait_ordered_range(dst, dst_loff, len); | ||
| 3543 | } | ||
| 3544 | goto again; | ||
| 3545 | } | ||
| 3546 | ASSERT(ret == 0); | ||
| 3547 | if (WARN_ON(ret)) { | ||
| 3548 | /* ranges in the io trees already unlocked */ | ||
| 3549 | btrfs_cmp_data_free(cmp); | ||
| 3550 | return ret; | ||
| 3551 | } | ||
| 3552 | |||
| 3553 | /* pass original length for comparison so we stay within i_size */ | ||
| 3554 | ret = btrfs_cmp_data(olen, cmp); | ||
| 3555 | if (ret == 0) | ||
| 3556 | ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); | ||
| 3557 | |||
| 3558 | if (same_inode) | ||
| 3559 | unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, | ||
| 3560 | same_lock_start + same_lock_len - 1); | ||
| 3561 | else | ||
| 3562 | btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); | ||
| 3563 | 3243 | ||
| 3564 | btrfs_cmp_data_free(cmp); | 3244 | /* |
| 3245 | * Lock destination range to serialize with concurrent readpages(). | ||
| 3246 | */ | ||
| 3247 | lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); | ||
| 3248 | ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); | ||
| 3249 | unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); | ||
| 3565 | 3250 | ||
| 3566 | return ret; | 3251 | return ret; |
| 3567 | } | 3252 | } |
| @@ -3572,63 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, | |||
| 3572 | struct inode *dst, u64 dst_loff) | 3257 | struct inode *dst, u64 dst_loff) |
| 3573 | { | 3258 | { |
| 3574 | int ret; | 3259 | int ret; |
| 3575 | struct cmp_pages cmp; | ||
| 3576 | int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; | 3260 | int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; |
| 3577 | bool same_inode = (src == dst); | ||
| 3578 | u64 i, tail_len, chunk_count; | 3261 | u64 i, tail_len, chunk_count; |
| 3579 | 3262 | ||
| 3580 | if (olen == 0) | ||
| 3581 | return 0; | ||
| 3582 | |||
| 3583 | if (same_inode) | ||
| 3584 | inode_lock(src); | ||
| 3585 | else | ||
| 3586 | btrfs_double_inode_lock(src, dst); | ||
| 3587 | |||
| 3588 | /* don't make the dst file partly checksummed */ | 3263 | /* don't make the dst file partly checksummed */ |
| 3589 | if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != | 3264 | if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != |
| 3590 | (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { | 3265 | (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) |
| 3591 | ret = -EINVAL; | 3266 | return -EINVAL; |
| 3592 | goto out_unlock; | ||
| 3593 | } | ||
| 3594 | 3267 | ||
| 3595 | if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) { | 3268 | if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) |
| 3596 | ret = -ETXTBSY; | 3269 | return -ETXTBSY; |
| 3597 | goto out_unlock; | ||
| 3598 | } | ||
| 3599 | 3270 | ||
| 3600 | tail_len = olen % BTRFS_MAX_DEDUPE_LEN; | 3271 | tail_len = olen % BTRFS_MAX_DEDUPE_LEN; |
| 3601 | chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); | 3272 | chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); |
| 3602 | if (chunk_count == 0) | 3273 | if (chunk_count == 0) |
| 3603 | num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; | 3274 | num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; |
| 3604 | 3275 | ||
| 3605 | /* | ||
| 3606 | * If deduping ranges in the same inode, locking rules make it | ||
| 3607 | * mandatory to always lock pages in ascending order to avoid deadlocks | ||
| 3608 | * with concurrent tasks (such as starting writeback/delalloc). | ||
| 3609 | */ | ||
| 3610 | if (same_inode && dst_loff < loff) | ||
| 3611 | swap(loff, dst_loff); | ||
| 3612 | |||
| 3613 | /* | ||
| 3614 | * We must gather up all the pages before we initiate our extent | ||
| 3615 | * locking. We use an array for the page pointers. Size of the array is | ||
| 3616 | * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. | ||
| 3617 | */ | ||
| 3618 | cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), | ||
| 3619 | GFP_KERNEL | __GFP_ZERO); | ||
| 3620 | cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), | ||
| 3621 | GFP_KERNEL | __GFP_ZERO); | ||
| 3622 | if (!cmp.src_pages || !cmp.dst_pages) { | ||
| 3623 | ret = -ENOMEM; | ||
| 3624 | goto out_free; | ||
| 3625 | } | ||
| 3626 | |||
| 3627 | for (i = 0; i < chunk_count; i++) { | 3276 | for (i = 0; i < chunk_count; i++) { |
| 3628 | ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, | 3277 | ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, |
| 3629 | dst, dst_loff, &cmp); | 3278 | dst, dst_loff); |
| 3630 | if (ret) | 3279 | if (ret) |
| 3631 | goto out_free; | 3280 | return ret; |
| 3632 | 3281 | ||
| 3633 | loff += BTRFS_MAX_DEDUPE_LEN; | 3282 | loff += BTRFS_MAX_DEDUPE_LEN; |
| 3634 | dst_loff += BTRFS_MAX_DEDUPE_LEN; | 3283 | dst_loff += BTRFS_MAX_DEDUPE_LEN; |
| @@ -3636,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, | |||
| 3636 | 3285 | ||
| 3637 | if (tail_len > 0) | 3286 | if (tail_len > 0) |
| 3638 | ret = btrfs_extent_same_range(src, loff, tail_len, dst, | 3287 | ret = btrfs_extent_same_range(src, loff, tail_len, dst, |
| 3639 | dst_loff, &cmp); | 3288 | dst_loff); |
| 3640 | |||
| 3641 | out_free: | ||
| 3642 | kvfree(cmp.src_pages); | ||
| 3643 | kvfree(cmp.dst_pages); | ||
| 3644 | |||
| 3645 | out_unlock: | ||
| 3646 | if (same_inode) | ||
| 3647 | inode_unlock(src); | ||
| 3648 | else | ||
| 3649 | btrfs_double_inode_unlock(src, dst); | ||
| 3650 | 3289 | ||
| 3651 | return ret; | 3290 | return ret; |
| 3652 | } | 3291 | } |
| @@ -4233,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, | |||
| 4233 | struct inode *inode = file_inode(file); | 3872 | struct inode *inode = file_inode(file); |
| 4234 | struct inode *src = file_inode(file_src); | 3873 | struct inode *src = file_inode(file_src); |
| 4235 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | 3874 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
| 4236 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 4237 | int ret; | 3875 | int ret; |
| 4238 | u64 len = olen; | 3876 | u64 len = olen; |
| 4239 | u64 bs = fs_info->sb->s_blocksize; | 3877 | u64 bs = fs_info->sb->s_blocksize; |
| 4240 | int same_inode = src == inode; | ||
| 4241 | 3878 | ||
| 4242 | /* | 3879 | /* |
| 4243 | * TODO: | 3880 | * TODO: |
| @@ -4250,106 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, | |||
| 4250 | * be either compressed or non-compressed. | 3887 | * be either compressed or non-compressed. |
| 4251 | */ | 3888 | */ |
| 4252 | 3889 | ||
| 4253 | if (btrfs_root_readonly(root)) | ||
| 4254 | return -EROFS; | ||
| 4255 | |||
| 4256 | if (file_src->f_path.mnt != file->f_path.mnt || | ||
| 4257 | src->i_sb != inode->i_sb) | ||
| 4258 | return -EXDEV; | ||
| 4259 | |||
| 4260 | if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) | ||
| 4261 | return -EISDIR; | ||
| 4262 | |||
| 4263 | if (!same_inode) { | ||
| 4264 | btrfs_double_inode_lock(src, inode); | ||
| 4265 | } else { | ||
| 4266 | inode_lock(src); | ||
| 4267 | } | ||
| 4268 | |||
| 4269 | /* don't make the dst file partly checksummed */ | 3890 | /* don't make the dst file partly checksummed */ |
| 4270 | if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != | 3891 | if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != |
| 4271 | (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { | 3892 | (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) |
| 4272 | ret = -EINVAL; | 3893 | return -EINVAL; |
| 4273 | goto out_unlock; | ||
| 4274 | } | ||
| 4275 | 3894 | ||
| 4276 | if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) { | 3895 | if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) |
| 4277 | ret = -ETXTBSY; | 3896 | return -ETXTBSY; |
| 4278 | goto out_unlock; | ||
| 4279 | } | ||
| 4280 | 3897 | ||
| 4281 | /* determine range to clone */ | ||
| 4282 | ret = -EINVAL; | ||
| 4283 | if (off + len > src->i_size || off + len < off) | ||
| 4284 | goto out_unlock; | ||
| 4285 | if (len == 0) | ||
| 4286 | olen = len = src->i_size - off; | ||
| 4287 | /* | 3898 | /* |
| 4288 | * If we extend to eof, continue to block boundary if and only if the | 3899 | * VFS's generic_remap_file_range_prep() protects us from cloning the |
| 4289 | * destination end offset matches the destination file's size, otherwise | 3900 | * eof block into the middle of a file, which would result in corruption |
| 4290 | * we would be corrupting data by placing the eof block into the middle | 3901 | * if the file size is not blocksize aligned. So we don't need to check |
| 4291 | * of a file. | 3902 | * for that case here. |
| 4292 | */ | 3903 | */ |
| 4293 | if (off + len == src->i_size) { | 3904 | if (off + len == src->i_size) |
| 4294 | if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) | ||
| 4295 | goto out_unlock; | ||
| 4296 | len = ALIGN(src->i_size, bs) - off; | 3905 | len = ALIGN(src->i_size, bs) - off; |
| 4297 | } | ||
| 4298 | |||
| 4299 | if (len == 0) { | ||
| 4300 | ret = 0; | ||
| 4301 | goto out_unlock; | ||
| 4302 | } | ||
| 4303 | |||
| 4304 | /* verify the end result is block aligned */ | ||
| 4305 | if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || | ||
| 4306 | !IS_ALIGNED(destoff, bs)) | ||
| 4307 | goto out_unlock; | ||
| 4308 | |||
| 4309 | /* verify if ranges are overlapped within the same file */ | ||
| 4310 | if (same_inode) { | ||
| 4311 | if (destoff + len > off && destoff < off + len) | ||
| 4312 | goto out_unlock; | ||
| 4313 | } | ||
| 4314 | 3906 | ||
| 4315 | if (destoff > inode->i_size) { | 3907 | if (destoff > inode->i_size) { |
| 4316 | ret = btrfs_cont_expand(inode, inode->i_size, destoff); | 3908 | ret = btrfs_cont_expand(inode, inode->i_size, destoff); |
| 4317 | if (ret) | 3909 | if (ret) |
| 4318 | goto out_unlock; | 3910 | return ret; |
| 4319 | } | 3911 | } |
| 4320 | 3912 | ||
| 4321 | /* | 3913 | /* |
| 4322 | * Lock the target range too. Right after we replace the file extent | 3914 | * Lock destination range to serialize with concurrent readpages(). |
| 4323 | * items in the fs tree (which now point to the cloned data), we might | ||
| 4324 | * have a worker replace them with extent items relative to a write | ||
| 4325 | * operation that was issued before this clone operation (i.e. confront | ||
| 4326 | * with inode.c:btrfs_finish_ordered_io). | ||
| 4327 | */ | 3915 | */ |
| 4328 | if (same_inode) { | 3916 | lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); |
| 4329 | u64 lock_start = min_t(u64, off, destoff); | ||
| 4330 | u64 lock_len = max_t(u64, off, destoff) + len - lock_start; | ||
| 4331 | |||
| 4332 | ret = lock_extent_range(src, lock_start, lock_len, true); | ||
| 4333 | } else { | ||
| 4334 | ret = btrfs_double_extent_lock(src, off, inode, destoff, len, | ||
| 4335 | true); | ||
| 4336 | } | ||
| 4337 | ASSERT(ret == 0); | ||
| 4338 | if (WARN_ON(ret)) { | ||
| 4339 | /* ranges in the io trees already unlocked */ | ||
| 4340 | goto out_unlock; | ||
| 4341 | } | ||
| 4342 | |||
| 4343 | ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); | 3917 | ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); |
| 4344 | 3918 | unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); | |
| 4345 | if (same_inode) { | ||
| 4346 | u64 lock_start = min_t(u64, off, destoff); | ||
| 4347 | u64 lock_end = max_t(u64, off, destoff) + len - 1; | ||
| 4348 | |||
| 4349 | unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); | ||
| 4350 | } else { | ||
| 4351 | btrfs_double_extent_unlock(src, off, inode, destoff, len); | ||
| 4352 | } | ||
| 4353 | /* | 3919 | /* |
| 4354 | * Truncate page cache pages so that future reads will see the cloned | 3920 | * Truncate page cache pages so that future reads will see the cloned |
| 4355 | * data immediately and not the previous data. | 3921 | * data immediately and not the previous data. |
| @@ -4357,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, | |||
| 4357 | truncate_inode_pages_range(&inode->i_data, | 3923 | truncate_inode_pages_range(&inode->i_data, |
| 4358 | round_down(destoff, PAGE_SIZE), | 3924 | round_down(destoff, PAGE_SIZE), |
| 4359 | round_up(destoff + len, PAGE_SIZE) - 1); | 3925 | round_up(destoff + len, PAGE_SIZE) - 1); |
| 4360 | out_unlock: | 3926 | |
| 3927 | return ret; | ||
| 3928 | } | ||
| 3929 | |||
| 3930 | static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, | ||
| 3931 | struct file *file_out, loff_t pos_out, | ||
| 3932 | loff_t *len, unsigned int remap_flags) | ||
| 3933 | { | ||
| 3934 | struct inode *inode_in = file_inode(file_in); | ||
| 3935 | struct inode *inode_out = file_inode(file_out); | ||
| 3936 | u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; | ||
| 3937 | bool same_inode = inode_out == inode_in; | ||
| 3938 | u64 wb_len; | ||
| 3939 | int ret; | ||
| 3940 | |||
| 3941 | if (!(remap_flags & REMAP_FILE_DEDUP)) { | ||
| 3942 | struct btrfs_root *root_out = BTRFS_I(inode_out)->root; | ||
| 3943 | |||
| 3944 | if (btrfs_root_readonly(root_out)) | ||
| 3945 | return -EROFS; | ||
| 3946 | |||
| 3947 | if (file_in->f_path.mnt != file_out->f_path.mnt || | ||
| 3948 | inode_in->i_sb != inode_out->i_sb) | ||
| 3949 | return -EXDEV; | ||
| 3950 | } | ||
| 3951 | |||
| 3952 | if (same_inode) | ||
| 3953 | inode_lock(inode_in); | ||
| 3954 | else | ||
| 3955 | btrfs_double_inode_lock(inode_in, inode_out); | ||
| 3956 | |||
| 3957 | /* | ||
| 3958 | * Now that the inodes are locked, we need to start writeback ourselves | ||
| 3959 | * and can not rely on the writeback from the VFS's generic helper | ||
| 3960 | * generic_remap_file_range_prep() because: | ||
| 3961 | * | ||
| 3962 | * 1) For compression we must call filemap_fdatawrite_range() range | ||
| 3963 | * twice (btrfs_fdatawrite_range() does it for us), and the generic | ||
| 3964 | * helper only calls it once; | ||
| 3965 | * | ||
| 3966 | * 2) filemap_fdatawrite_range(), called by the generic helper only | ||
| 3967 | * waits for the writeback to complete, i.e. for IO to be done, and | ||
| 3968 | * not for the ordered extents to complete. We need to wait for them | ||
| 3969 | * to complete so that new file extent items are in the fs tree. | ||
| 3970 | */ | ||
| 3971 | if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) | ||
| 3972 | wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); | ||
| 3973 | else | ||
| 3974 | wb_len = ALIGN(*len, bs); | ||
| 3975 | |||
| 3976 | /* | ||
| 3977 | * Since we don't lock ranges, wait for ongoing lockless dio writes (as | ||
| 3978 | * any in progress could create its ordered extents after we wait for | ||
| 3979 | * existing ordered extents below). | ||
| 3980 | */ | ||
| 3981 | inode_dio_wait(inode_in); | ||
| 4361 | if (!same_inode) | 3982 | if (!same_inode) |
| 4362 | btrfs_double_inode_unlock(src, inode); | 3983 | inode_dio_wait(inode_out); |
| 3984 | |||
| 3985 | ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), | ||
| 3986 | wb_len); | ||
| 3987 | if (ret < 0) | ||
| 3988 | goto out_unlock; | ||
| 3989 | ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), | ||
| 3990 | wb_len); | ||
| 3991 | if (ret < 0) | ||
| 3992 | goto out_unlock; | ||
| 3993 | |||
| 3994 | ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, | ||
| 3995 | len, remap_flags); | ||
| 3996 | if (ret < 0 || *len == 0) | ||
| 3997 | goto out_unlock; | ||
| 3998 | |||
| 3999 | return 0; | ||
| 4000 | |||
| 4001 | out_unlock: | ||
| 4002 | if (same_inode) | ||
| 4003 | inode_unlock(inode_in); | ||
| 4363 | else | 4004 | else |
| 4364 | inode_unlock(src); | 4005 | btrfs_double_inode_unlock(inode_in, inode_out); |
| 4006 | |||
| 4365 | return ret; | 4007 | return ret; |
| 4366 | } | 4008 | } |
| 4367 | 4009 | ||
| @@ -4369,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, | |||
| 4369 | struct file *dst_file, loff_t destoff, loff_t len, | 4011 | struct file *dst_file, loff_t destoff, loff_t len, |
| 4370 | unsigned int remap_flags) | 4012 | unsigned int remap_flags) |
| 4371 | { | 4013 | { |
| 4014 | struct inode *src_inode = file_inode(src_file); | ||
| 4015 | struct inode *dst_inode = file_inode(dst_file); | ||
| 4016 | bool same_inode = dst_inode == src_inode; | ||
| 4372 | int ret; | 4017 | int ret; |
| 4373 | 4018 | ||
| 4374 | if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) | 4019 | if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) |
| 4375 | return -EINVAL; | 4020 | return -EINVAL; |
| 4376 | 4021 | ||
| 4377 | if (remap_flags & REMAP_FILE_DEDUP) { | 4022 | ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, |
| 4378 | struct inode *src = file_inode(src_file); | 4023 | &len, remap_flags); |
| 4379 | struct inode *dst = file_inode(dst_file); | 4024 | if (ret < 0 || len == 0) |
| 4380 | u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; | 4025 | return ret; |
| 4381 | |||
| 4382 | if (WARN_ON_ONCE(bs < PAGE_SIZE)) { | ||
| 4383 | /* | ||
| 4384 | * Btrfs does not support blocksize < page_size. As a | ||
| 4385 | * result, btrfs_cmp_data() won't correctly handle | ||
| 4386 | * this situation without an update. | ||
| 4387 | */ | ||
| 4388 | return -EINVAL; | ||
| 4389 | } | ||
| 4390 | 4026 | ||
| 4391 | ret = btrfs_extent_same(src, off, len, dst, destoff); | 4027 | if (remap_flags & REMAP_FILE_DEDUP) |
| 4392 | } else { | 4028 | ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); |
| 4029 | else | ||
| 4393 | ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); | 4030 | ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); |
| 4394 | } | 4031 | |
| 4032 | if (same_inode) | ||
| 4033 | inode_unlock(src_inode); | ||
| 4034 | else | ||
| 4035 | btrfs_double_inode_unlock(src_inode, dst_inode); | ||
| 4036 | |||
| 4395 | return ret < 0 ? ret : len; | 4037 | return ret < 0 ? ret : len; |
| 4396 | } | 4038 | } |
| 4397 | 4039 | ||
