Diffstat (limited to 'fs/xfs/xfs_file.c')
 -rw-r--r--	fs/xfs/xfs_file.c	232
 1 file changed, 33 insertions(+), 199 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a314fc7b56fa..6e4f7f900fea 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -249,6 +249,7 @@ xfs_file_dio_aio_read(
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			isize = i_size_read(inode);
 	size_t			count = iov_iter_count(to);
+	loff_t			end = iocb->ki_pos + count - 1;
 	struct iov_iter		data;
 	struct xfs_buftarg	*target;
 	ssize_t			ret = 0;
@@ -272,49 +273,21 @@ xfs_file_dio_aio_read(
 
 	file_accessed(iocb->ki_filp);
 
-	/*
-	 * Locking is a bit tricky here. If we take an exclusive lock for direct
-	 * IO, we effectively serialise all new concurrent read IO to this file
-	 * and block it behind IO that is currently in progress because IO in
-	 * progress holds the IO lock shared. We only need to hold the lock
-	 * exclusive to blow away the page cache, so only take lock exclusively
-	 * if the page cache needs invalidation. This allows the normal direct
-	 * IO case of no page cache pages to proceeed concurrently without
-	 * serialisation.
-	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 	if (mapping->nrpages) {
-		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+		if (ret)
+			goto out_unlock;
 
-		/*
-		 * The generic dio code only flushes the range of the particular
-		 * I/O. Because we take an exclusive lock here, this whole
-		 * sequence is considerably more expensive for us. This has a
-		 * noticeable performance impact for any file with cached pages,
-		 * even when outside of the range of the particular I/O.
-		 *
-		 * Hence, amortize the cost of the lock against a full file
-		 * flush and reduce the chances of repeated iolock cycles going
-		 * forward.
-		 */
-		if (mapping->nrpages) {
-			ret = filemap_write_and_wait(mapping);
-			if (ret) {
-				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-				return ret;
-			}
-
-			/*
-			 * Invalidate whole pages. This can return an error if
-			 * we fail to invalidate a page, but this should never
-			 * happen on XFS. Warn if it does fail.
-			 */
-			ret = invalidate_inode_pages2(mapping);
-			WARN_ON_ONCE(ret);
-			ret = 0;
-		}
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		/*
+		 * Invalidate whole pages. This can return an error if we fail
+		 * to invalidate a page, but this should never happen on XFS.
+		 * Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	data = *to;
@@ -324,8 +297,9 @@ xfs_file_dio_aio_read(
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
 	}
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+out_unlock:
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
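The read path above now writes back and invalidates only the pages that overlap the I/O, converting the inclusive byte range [iocb->ki_pos, end] to page indices with a right shift by PAGE_SHIFT. Below is a minimal userspace sketch of that index arithmetic (not kernel code; the 4 KiB page size and the helper name are assumptions for illustration):

	#include <inttypes.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12	/* assumed 4 KiB pages, as on x86-64 */

	/*
	 * Map an inclusive byte range onto the page indices that a range
	 * invalidation such as invalidate_inode_pages2_range() would cover.
	 */
	static void byte_range_to_pages(uint64_t pos, uint64_t count)
	{
		uint64_t end = pos + count - 1;		/* inclusive last byte */
		uint64_t first = pos >> PAGE_SHIFT;	/* first page index */
		uint64_t last = end >> PAGE_SHIFT;	/* last page index */

		printf("bytes [%" PRIu64 ", %" PRIu64 "] -> pages [%" PRIu64 ", %" PRIu64 "]\n",
		       pos, end, first, last);
	}

	int main(void)
	{
		byte_range_to_pages(0, 4096);		/* exactly one page: [0, 0] */
		byte_range_to_pages(4095, 2);		/* straddles a boundary: [0, 1] */
		byte_range_to_pages(8192, 12288);	/* three whole pages: [2, 4] */
		return 0;
	}

Because end is inclusive, an I/O that stops exactly on a page boundary does not drag the following page into the invalidation.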
@@ -570,61 +544,49 @@ xfs_file_dio_aio_write(
 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask))
-		unaligned_io = 1;
-
 	/*
-	 * We don't need to take an exclusive lock unless there page cache needs
-	 * to be invalidated or unaligned IO is being executed. We don't need to
-	 * consider the EOF extension case here because
-	 * xfs_file_aio_write_checks() will relock the inode as necessary for
-	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 * Don't take the exclusive iolock here unless the I/O is unaligned to
+	 * the file system block size.  We don't need to consider the EOF
+	 * extension case here because xfs_file_aio_write_checks() will relock
+	 * the inode as necessary for EOF zeroing cases and fill out the new
+	 * inode size as appropriate.
 	 */
-	if (unaligned_io || mapping->nrpages)
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
+		unaligned_io = 1;
 		iolock = XFS_IOLOCK_EXCL;
-	else
+	} else {
 		iolock = XFS_IOLOCK_SHARED;
-	xfs_rw_ilock(ip, iolock);
-
-	/*
-	 * Recheck if there are cached pages that need invalidate after we got
-	 * the iolock to protect against other threads adding new pages while
-	 * we were waiting for the iolock.
-	 */
-	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
-		xfs_rw_iunlock(ip, iolock);
-		iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, iolock);
 	}
 
+	xfs_rw_ilock(ip, iolock);
+
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
 	end = iocb->ki_pos + count - 1;
 
-	/*
-	 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
-	 */
 	if (mapping->nrpages) {
-		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 		if (ret)
 			goto out;
+
 		/*
 		 * Invalidate whole pages. This can return an error if we fail
 		 * to invalidate a page, but this should never happen on XFS.
 		 * Warn if it does fail.
 		 */
-		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 		WARN_ON_ONCE(ret);
 		ret = 0;
 	}
 
 	/*
 	 * If we are doing unaligned IO, wait for all other IO to drain,
-	 * otherwise demote the lock if we had to flush cached pages
+	 * otherwise demote the lock if we had to take the exclusive lock
+	 * for other reasons in xfs_file_aio_write_checks.
 	 */
 	if (unaligned_io)
 		inode_dio_wait(inode);
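The write path now derives the lock mode directly from block alignment: if either end of the I/O falls inside a filesystem block, unaligned_io is set and the exclusive iolock is taken; otherwise the shared lock suffices. A small userspace sketch of the mask test (the 4 KiB block size is an assumption; XFS keeps blocksize - 1 in mp->m_blockmask):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * For power-of-two block sizes the mask is blocksize - 1, so a
	 * nonzero (x & blockmask) means x is not block aligned.
	 */
	static bool dio_write_is_unaligned(uint64_t pos, uint64_t count,
					   uint64_t blockmask)
	{
		return (pos & blockmask) || ((pos + count) & blockmask);
	}

	int main(void)
	{
		uint64_t blockmask = 4096 - 1;	/* assumed 4 KiB filesystem blocks */

		/* fully aligned: the shared iolock would do */
		printf("%d\n", dio_write_is_unaligned(4096, 8192, blockmask));	/* 0 */
		/* start is misaligned: exclusive iolock */
		printf("%d\n", dio_write_is_unaligned(512, 8192, blockmask));	/* 1 */
		/* end is misaligned: exclusive iolock */
		printf("%d\n", dio_write_is_unaligned(4096, 1000, blockmask));	/* 1 */
		return 0;
	}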
@@ -947,134 +909,6 @@ out_unlock:
 	return error;
 }
 
-/*
- * Flush all file writes out to disk.
- */
-static int
-xfs_file_wait_for_io(
-	struct inode	*inode,
-	loff_t		offset,
-	size_t		len)
-{
-	loff_t		rounding;
-	loff_t		ioffset;
-	loff_t		iendoffset;
-	loff_t		bs;
-	int		ret;
-
-	bs = inode->i_sb->s_blocksize;
-	inode_dio_wait(inode);
-
-	rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
-	ioffset = round_down(offset, rounding);
-	iendoffset = round_up(offset + len, rounding) - 1;
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-					   iendoffset);
-	return ret;
-}
-
-/* Hook up to the VFS reflink function */
-STATIC int
-xfs_file_share_range(
-	struct file	*file_in,
-	loff_t		pos_in,
-	struct file	*file_out,
-	loff_t		pos_out,
-	u64		len,
-	bool		is_dedupe)
-{
-	struct inode	*inode_in;
-	struct inode	*inode_out;
-	ssize_t		ret;
-	loff_t		bs;
-	loff_t		isize;
-	int		same_inode;
-	loff_t		blen;
-	unsigned int	flags = 0;
-
-	inode_in = file_inode(file_in);
-	inode_out = file_inode(file_out);
-	bs = inode_out->i_sb->s_blocksize;
-
-	/* Don't touch certain kinds of inodes */
-	if (IS_IMMUTABLE(inode_out))
-		return -EPERM;
-	if (IS_SWAPFILE(inode_in) ||
-	    IS_SWAPFILE(inode_out))
-		return -ETXTBSY;
-
-	/* Reflink only works within this filesystem. */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
-	same_inode = (inode_in->i_ino == inode_out->i_ino);
-
-	/* Don't reflink dirs, pipes, sockets... */
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
-		return -EINVAL;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
-
-	/* Don't share DAX file data for now. */
-	if (IS_DAX(inode_in) || IS_DAX(inode_out))
-		return -EINVAL;
-
-	/* Are we going all the way to the end? */
-	isize = i_size_read(inode_in);
-	if (isize == 0)
-		return 0;
-	if (len == 0)
-		len = isize - pos_in;
-
-	/* Ensure offsets don't wrap and the input is inside i_size */
-	if (pos_in + len < pos_in || pos_out + len < pos_out ||
-	    pos_in + len > isize)
-		return -EINVAL;
-
-	/* Don't allow dedupe past EOF in the dest file */
-	if (is_dedupe) {
-		loff_t	disize;
-
-		disize = i_size_read(inode_out);
-		if (pos_out >= disize || pos_out + len > disize)
-			return -EINVAL;
-	}
-
-	/* If we're linking to EOF, continue to the block boundary. */
-	if (pos_in + len == isize)
-		blen = ALIGN(isize, bs) - pos_in;
-	else
-		blen = len;
-
-	/* Only reflink if we're aligned to block boundaries */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-		return -EINVAL;
-
-	/* Don't allow overlapped reflink within the same file */
-	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
-		return -EINVAL;
-
-	/* Wait for the completion of any pending IOs on srcfile */
-	ret = xfs_file_wait_for_io(inode_in, pos_in, len);
-	if (ret)
-		goto out;
-	ret = xfs_file_wait_for_io(inode_out, pos_out, len);
-	if (ret)
-		goto out;
-
-	if (is_dedupe)
-		flags |= XFS_REFLINK_DEDUPE;
-	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
-			pos_out, len, flags);
-	if (ret < 0)
-		goto out;
-
-out:
-	return ret;
-}
-
 STATIC ssize_t
 xfs_file_copy_range(
 	struct file	*file_in,
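The removed xfs_file_wait_for_io() helper widened the flush range to whole pages or filesystem blocks, whichever is larger, before calling filemap_write_and_wait_range(). A hedged userspace sketch of that rounding (round_down_u64/round_up_u64 are stand-ins for the kernel's round_down/round_up macros; the sizes are illustrative):

	#include <inttypes.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL	/* assumed page size */

	/* stand-ins for the kernel's round_down()/round_up() macros */
	static uint64_t round_down_u64(uint64_t x, uint64_t y)
	{
		return x - (x % y);
	}

	static uint64_t round_up_u64(uint64_t x, uint64_t y)
	{
		return ((x + y - 1) / y) * y;
	}

	int main(void)
	{
		uint64_t bs = 1024;	/* hypothetical 1 KiB block size */
		uint64_t offset = 5000, len = 300;

		/* widen to whole pages or blocks, whichever is larger */
		uint64_t rounding = bs > PAGE_SIZE ? bs : PAGE_SIZE;
		uint64_t start = round_down_u64(offset, rounding);
		uint64_t end = round_up_u64(offset + len, rounding) - 1;

		/* prints "flush bytes [4096, 8191]" */
		printf("flush bytes [%" PRIu64 ", %" PRIu64 "]\n", start, end);
		return 0;
	}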
@@ -1086,7 +920,7 @@ xfs_file_copy_range(
 {
 	int		error;
 
-	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+	error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
 			len, false);
 	if (error)
 		return error;
@@ -1101,7 +935,7 @@ xfs_file_clone_range(
 	loff_t		pos_out,
 	u64		len)
 {
-	return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
 			len, false);
 }
 
@@ -1124,7 +958,7 @@ xfs_file_dedupe_range(
 	if (len > XFS_MAX_DEDUPE_LEN)
 		len = XFS_MAX_DEDUPE_LEN;
 
-	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
 			len, true);
 	if (error)
 		return error;
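With xfs_file_share_range() gone, its three callers pass straight through to xfs_reflink_remap_range(), which is now expected to carry the removed validation, including the same-file overlap test quoted above. A userspace sketch of that overlap predicate (names are illustrative, not the kernel's):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Two block-aligned ranges [pos_in, pos_in + blen) and
	 * [pos_out, pos_out + blen) overlap iff each starts before the
	 * other ends -- the test the removed code used to reject
	 * overlapping reflinks within one file.
	 */
	static bool ranges_overlap(uint64_t pos_in, uint64_t pos_out,
				   uint64_t blen)
	{
		return pos_out + blen > pos_in && pos_out < pos_in + blen;
	}

	int main(void)
	{
		printf("%d\n", ranges_overlap(0, 4096, 4096));	/* adjacent: 0 */
		printf("%d\n", ranges_overlap(0, 2048, 4096));	/* overlap: 1 */
		printf("%d\n", ranges_overlap(8192, 0, 4096));	/* disjoint: 0 */
		return 0;
	}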