diff options
Diffstat (limited to 'fs/xfs/xfs_file.c')
| -rw-r--r-- | fs/xfs/xfs_file.c | 232 |
1 files changed, 33 insertions, 199 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a314fc7b56fa..6e4f7f900fea 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
| @@ -249,6 +249,7 @@ xfs_file_dio_aio_read( | |||
| 249 | struct xfs_inode *ip = XFS_I(inode); | 249 | struct xfs_inode *ip = XFS_I(inode); |
| 250 | loff_t isize = i_size_read(inode); | 250 | loff_t isize = i_size_read(inode); |
| 251 | size_t count = iov_iter_count(to); | 251 | size_t count = iov_iter_count(to); |
| 252 | loff_t end = iocb->ki_pos + count - 1; | ||
| 252 | struct iov_iter data; | 253 | struct iov_iter data; |
| 253 | struct xfs_buftarg *target; | 254 | struct xfs_buftarg *target; |
| 254 | ssize_t ret = 0; | 255 | ssize_t ret = 0; |
| @@ -272,49 +273,21 @@ xfs_file_dio_aio_read( | |||
| 272 | 273 | ||
| 273 | file_accessed(iocb->ki_filp); | 274 | file_accessed(iocb->ki_filp); |
| 274 | 275 | ||
| 275 | /* | ||
| 276 | * Locking is a bit tricky here. If we take an exclusive lock for direct | ||
| 277 | * IO, we effectively serialise all new concurrent read IO to this file | ||
| 278 | * and block it behind IO that is currently in progress because IO in | ||
| 279 | * progress holds the IO lock shared. We only need to hold the lock | ||
| 280 | * exclusive to blow away the page cache, so only take lock exclusively | ||
| 281 | * if the page cache needs invalidation. This allows the normal direct | ||
| 282 | * IO case of no page cache pages to proceeed concurrently without | ||
| 283 | * serialisation. | ||
| 284 | */ | ||
| 285 | xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); | 276 | xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); |
| 286 | if (mapping->nrpages) { | 277 | if (mapping->nrpages) { |
| 287 | xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); | 278 | ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); |
| 288 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); | 279 | if (ret) |
| 280 | goto out_unlock; | ||
| 289 | 281 | ||
| 290 | /* | 282 | /* |
| 291 | * The generic dio code only flushes the range of the particular | 283 | * Invalidate whole pages. This can return an error if we fail |
| 292 | * I/O. Because we take an exclusive lock here, this whole | 284 | * to invalidate a page, but this should never happen on XFS. |
| 293 | * sequence is considerably more expensive for us. This has a | 285 | * Warn if it does fail. |
| 294 | * noticeable performance impact for any file with cached pages, | ||
| 295 | * even when outside of the range of the particular I/O. | ||
| 296 | * | ||
| 297 | * Hence, amortize the cost of the lock against a full file | ||
| 298 | * flush and reduce the chances of repeated iolock cycles going | ||
| 299 | * forward. | ||
| 300 | */ | 286 | */ |
| 301 | if (mapping->nrpages) { | 287 | ret = invalidate_inode_pages2_range(mapping, |
| 302 | ret = filemap_write_and_wait(mapping); | 288 | iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); |
| 303 | if (ret) { | 289 | WARN_ON_ONCE(ret); |
| 304 | xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); | 290 | ret = 0; |
| 305 | return ret; | ||
| 306 | } | ||
| 307 | |||
| 308 | /* | ||
| 309 | * Invalidate whole pages. This can return an error if | ||
| 310 | * we fail to invalidate a page, but this should never | ||
| 311 | * happen on XFS. Warn if it does fail. | ||
| 312 | */ | ||
| 313 | ret = invalidate_inode_pages2(mapping); | ||
| 314 | WARN_ON_ONCE(ret); | ||
| 315 | ret = 0; | ||
| 316 | } | ||
| 317 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); | ||
| 318 | } | 291 | } |
| 319 | 292 | ||
| 320 | data = *to; | 293 | data = *to; |
| @@ -324,8 +297,9 @@ xfs_file_dio_aio_read( | |||
| 324 | iocb->ki_pos += ret; | 297 | iocb->ki_pos += ret; |
| 325 | iov_iter_advance(to, ret); | 298 | iov_iter_advance(to, ret); |
| 326 | } | 299 | } |
| 327 | xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); | ||
| 328 | 300 | ||
| 301 | out_unlock: | ||
| 302 | xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); | ||
| 329 | return ret; | 303 | return ret; |
| 330 | } | 304 | } |
| 331 | 305 | ||
| @@ -570,61 +544,49 @@ xfs_file_dio_aio_write( | |||
| 570 | if ((iocb->ki_pos | count) & target->bt_logical_sectormask) | 544 | if ((iocb->ki_pos | count) & target->bt_logical_sectormask) |
| 571 | return -EINVAL; | 545 | return -EINVAL; |
| 572 | 546 | ||
| 573 | /* "unaligned" here means not aligned to a filesystem block */ | ||
| 574 | if ((iocb->ki_pos & mp->m_blockmask) || | ||
| 575 | ((iocb->ki_pos + count) & mp->m_blockmask)) | ||
| 576 | unaligned_io = 1; | ||
| 577 | |||
| 578 | /* | 547 | /* |
| 579 | * We don't need to take an exclusive lock unless there page cache needs | 548 | * Don't take the exclusive iolock here unless the I/O is unaligned to |
| 580 | * to be invalidated or unaligned IO is being executed. We don't need to | 549 | * the file system block size. We don't need to consider the EOF |
| 581 | * consider the EOF extension case here because | 550 | * extension case here because xfs_file_aio_write_checks() will relock |
| 582 | * xfs_file_aio_write_checks() will relock the inode as necessary for | 551 | * the inode as necessary for EOF zeroing cases and fill out the new |
| 583 | * EOF zeroing cases and fill out the new inode size as appropriate. | 552 | * inode size as appropriate. |
| 584 | */ | 553 | */ |
| 585 | if (unaligned_io || mapping->nrpages) | 554 | if ((iocb->ki_pos & mp->m_blockmask) || |
| 555 | ((iocb->ki_pos + count) & mp->m_blockmask)) { | ||
| 556 | unaligned_io = 1; | ||
| 586 | iolock = XFS_IOLOCK_EXCL; | 557 | iolock = XFS_IOLOCK_EXCL; |
| 587 | else | 558 | } else { |
| 588 | iolock = XFS_IOLOCK_SHARED; | 559 | iolock = XFS_IOLOCK_SHARED; |
| 589 | xfs_rw_ilock(ip, iolock); | ||
| 590 | |||
| 591 | /* | ||
| 592 | * Recheck if there are cached pages that need invalidate after we got | ||
| 593 | * the iolock to protect against other threads adding new pages while | ||
| 594 | * we were waiting for the iolock. | ||
| 595 | */ | ||
| 596 | if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { | ||
| 597 | xfs_rw_iunlock(ip, iolock); | ||
| 598 | iolock = XFS_IOLOCK_EXCL; | ||
| 599 | xfs_rw_ilock(ip, iolock); | ||
| 600 | } | 560 | } |
| 601 | 561 | ||
| 562 | xfs_rw_ilock(ip, iolock); | ||
| 563 | |||
| 602 | ret = xfs_file_aio_write_checks(iocb, from, &iolock); | 564 | ret = xfs_file_aio_write_checks(iocb, from, &iolock); |
| 603 | if (ret) | 565 | if (ret) |
| 604 | goto out; | 566 | goto out; |
| 605 | count = iov_iter_count(from); | 567 | count = iov_iter_count(from); |
| 606 | end = iocb->ki_pos + count - 1; | 568 | end = iocb->ki_pos + count - 1; |
| 607 | 569 | ||
| 608 | /* | ||
| 609 | * See xfs_file_dio_aio_read() for why we do a full-file flush here. | ||
| 610 | */ | ||
| 611 | if (mapping->nrpages) { | 570 | if (mapping->nrpages) { |
| 612 | ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); | 571 | ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); |
| 613 | if (ret) | 572 | if (ret) |
| 614 | goto out; | 573 | goto out; |
| 574 | |||
| 615 | /* | 575 | /* |
| 616 | * Invalidate whole pages. This can return an error if we fail | 576 | * Invalidate whole pages. This can return an error if we fail |
| 617 | * to invalidate a page, but this should never happen on XFS. | 577 | * to invalidate a page, but this should never happen on XFS. |
| 618 | * Warn if it does fail. | 578 | * Warn if it does fail. |
| 619 | */ | 579 | */ |
| 620 | ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); | 580 | ret = invalidate_inode_pages2_range(mapping, |
| 581 | iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); | ||
| 621 | WARN_ON_ONCE(ret); | 582 | WARN_ON_ONCE(ret); |
| 622 | ret = 0; | 583 | ret = 0; |
| 623 | } | 584 | } |
| 624 | 585 | ||
| 625 | /* | 586 | /* |
| 626 | * If we are doing unaligned IO, wait for all other IO to drain, | 587 | * If we are doing unaligned IO, wait for all other IO to drain, |
| 627 | * otherwise demote the lock if we had to flush cached pages | 588 | * otherwise demote the lock if we had to take the exclusive lock |
| 589 | * for other reasons in xfs_file_aio_write_checks. | ||
| 628 | */ | 590 | */ |
| 629 | if (unaligned_io) | 591 | if (unaligned_io) |
| 630 | inode_dio_wait(inode); | 592 | inode_dio_wait(inode); |
| @@ -947,134 +909,6 @@ out_unlock: | |||
| 947 | return error; | 909 | return error; |
| 948 | } | 910 | } |
| 949 | 911 | ||
| 950 | /* | ||
| 951 | * Flush all file writes out to disk. | ||
| 952 | */ | ||
| 953 | static int | ||
| 954 | xfs_file_wait_for_io( | ||
| 955 | struct inode *inode, | ||
| 956 | loff_t offset, | ||
| 957 | size_t len) | ||
| 958 | { | ||
| 959 | loff_t rounding; | ||
| 960 | loff_t ioffset; | ||
| 961 | loff_t iendoffset; | ||
| 962 | loff_t bs; | ||
| 963 | int ret; | ||
| 964 | |||
| 965 | bs = inode->i_sb->s_blocksize; | ||
| 966 | inode_dio_wait(inode); | ||
| 967 | |||
| 968 | rounding = max_t(xfs_off_t, bs, PAGE_SIZE); | ||
| 969 | ioffset = round_down(offset, rounding); | ||
| 970 | iendoffset = round_up(offset + len, rounding) - 1; | ||
| 971 | ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, | ||
| 972 | iendoffset); | ||
| 973 | return ret; | ||
| 974 | } | ||
| 975 | |||
| 976 | /* Hook up to the VFS reflink function */ | ||
| 977 | STATIC int | ||
| 978 | xfs_file_share_range( | ||
| 979 | struct file *file_in, | ||
| 980 | loff_t pos_in, | ||
| 981 | struct file *file_out, | ||
| 982 | loff_t pos_out, | ||
| 983 | u64 len, | ||
| 984 | bool is_dedupe) | ||
| 985 | { | ||
| 986 | struct inode *inode_in; | ||
| 987 | struct inode *inode_out; | ||
| 988 | ssize_t ret; | ||
| 989 | loff_t bs; | ||
| 990 | loff_t isize; | ||
| 991 | int same_inode; | ||
| 992 | loff_t blen; | ||
| 993 | unsigned int flags = 0; | ||
| 994 | |||
| 995 | inode_in = file_inode(file_in); | ||
| 996 | inode_out = file_inode(file_out); | ||
| 997 | bs = inode_out->i_sb->s_blocksize; | ||
| 998 | |||
| 999 | /* Don't touch certain kinds of inodes */ | ||
| 1000 | if (IS_IMMUTABLE(inode_out)) | ||
| 1001 | return -EPERM; | ||
| 1002 | if (IS_SWAPFILE(inode_in) || | ||
| 1003 | IS_SWAPFILE(inode_out)) | ||
| 1004 | return -ETXTBSY; | ||
| 1005 | |||
| 1006 | /* Reflink only works within this filesystem. */ | ||
| 1007 | if (inode_in->i_sb != inode_out->i_sb) | ||
| 1008 | return -EXDEV; | ||
| 1009 | same_inode = (inode_in->i_ino == inode_out->i_ino); | ||
| 1010 | |||
| 1011 | /* Don't reflink dirs, pipes, sockets... */ | ||
| 1012 | if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) | ||
| 1013 | return -EISDIR; | ||
| 1014 | if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) | ||
| 1015 | return -EINVAL; | ||
| 1016 | if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) | ||
| 1017 | return -EINVAL; | ||
| 1018 | |||
| 1019 | /* Don't share DAX file data for now. */ | ||
| 1020 | if (IS_DAX(inode_in) || IS_DAX(inode_out)) | ||
| 1021 | return -EINVAL; | ||
| 1022 | |||
| 1023 | /* Are we going all the way to the end? */ | ||
| 1024 | isize = i_size_read(inode_in); | ||
| 1025 | if (isize == 0) | ||
| 1026 | return 0; | ||
| 1027 | if (len == 0) | ||
| 1028 | len = isize - pos_in; | ||
| 1029 | |||
| 1030 | /* Ensure offsets don't wrap and the input is inside i_size */ | ||
| 1031 | if (pos_in + len < pos_in || pos_out + len < pos_out || | ||
| 1032 | pos_in + len > isize) | ||
| 1033 | return -EINVAL; | ||
| 1034 | |||
| 1035 | /* Don't allow dedupe past EOF in the dest file */ | ||
| 1036 | if (is_dedupe) { | ||
| 1037 | loff_t disize; | ||
| 1038 | |||
| 1039 | disize = i_size_read(inode_out); | ||
| 1040 | if (pos_out >= disize || pos_out + len > disize) | ||
| 1041 | return -EINVAL; | ||
| 1042 | } | ||
| 1043 | |||
| 1044 | /* If we're linking to EOF, continue to the block boundary. */ | ||
| 1045 | if (pos_in + len == isize) | ||
| 1046 | blen = ALIGN(isize, bs) - pos_in; | ||
| 1047 | else | ||
| 1048 | blen = len; | ||
| 1049 | |||
| 1050 | /* Only reflink if we're aligned to block boundaries */ | ||
| 1051 | if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || | ||
| 1052 | !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) | ||
| 1053 | return -EINVAL; | ||
| 1054 | |||
| 1055 | /* Don't allow overlapped reflink within the same file */ | ||
| 1056 | if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) | ||
| 1057 | return -EINVAL; | ||
| 1058 | |||
| 1059 | /* Wait for the completion of any pending IOs on srcfile */ | ||
| 1060 | ret = xfs_file_wait_for_io(inode_in, pos_in, len); | ||
| 1061 | if (ret) | ||
| 1062 | goto out; | ||
| 1063 | ret = xfs_file_wait_for_io(inode_out, pos_out, len); | ||
| 1064 | if (ret) | ||
| 1065 | goto out; | ||
| 1066 | |||
| 1067 | if (is_dedupe) | ||
| 1068 | flags |= XFS_REFLINK_DEDUPE; | ||
| 1069 | ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), | ||
| 1070 | pos_out, len, flags); | ||
| 1071 | if (ret < 0) | ||
| 1072 | goto out; | ||
| 1073 | |||
| 1074 | out: | ||
| 1075 | return ret; | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | STATIC ssize_t | 912 | STATIC ssize_t |
| 1079 | xfs_file_copy_range( | 913 | xfs_file_copy_range( |
| 1080 | struct file *file_in, | 914 | struct file *file_in, |
| @@ -1086,7 +920,7 @@ xfs_file_copy_range( | |||
| 1086 | { | 920 | { |
| 1087 | int error; | 921 | int error; |
| 1088 | 922 | ||
| 1089 | error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, | 923 | error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, |
| 1090 | len, false); | 924 | len, false); |
| 1091 | if (error) | 925 | if (error) |
| 1092 | return error; | 926 | return error; |
| @@ -1101,7 +935,7 @@ xfs_file_clone_range( | |||
| 1101 | loff_t pos_out, | 935 | loff_t pos_out, |
| 1102 | u64 len) | 936 | u64 len) |
| 1103 | { | 937 | { |
| 1104 | return xfs_file_share_range(file_in, pos_in, file_out, pos_out, | 938 | return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, |
| 1105 | len, false); | 939 | len, false); |
| 1106 | } | 940 | } |
| 1107 | 941 | ||
| @@ -1124,7 +958,7 @@ xfs_file_dedupe_range( | |||
| 1124 | if (len > XFS_MAX_DEDUPE_LEN) | 958 | if (len > XFS_MAX_DEDUPE_LEN) |
| 1125 | len = XFS_MAX_DEDUPE_LEN; | 959 | len = XFS_MAX_DEDUPE_LEN; |
| 1126 | 960 | ||
| 1127 | error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, | 961 | error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, |
| 1128 | len, true); | 962 | len, true); |
| 1129 | if (error) | 963 | if (error) |
| 1130 | return error; | 964 | return error; |
