aboutsummaryrefslogtreecommitdiffstats
path: root/fs/read_write.c
diff options
context:
space:
mode:
authorZach Brown <zab@redhat.com>2015-11-10 16:53:30 -0500
committerAl Viro <viro@zeniv.linux.org.uk>2015-12-01 14:00:53 -0500
commit29732938a6289a15e907da234d6692a2ead71855 (patch)
tree51b31c22095cac9bbe24278647e679b635e7ccec /fs/read_write.c
parent31ade3b83e1821da5fbb2f11b5b3d4ab2ec39db8 (diff)
vfs: add copy_file_range syscall and vfs helper
Add a copy_file_range() system call for offloading copies between regular files. This gives an interface to underlying layers of the storage stack which can copy without reading and writing all the data. There are a few candidates that should support copy offloading in the nearer term: - btrfs shares extent references with its clone ioctl - NFS has patches to add a COPY command which copies on the server - SCSI has a family of XCOPY commands which copy in the device This system call avoids the complexity of also accelerating the creation of the destination file by operating on an existing destination file descriptor, not a path. Currently the high level vfs entry point limits copy offloading to files on the same mount and super (and not in the same file). This can be relaxed if we get implementations which can copy between file systems safely. Signed-off-by: Zach Brown <zab@redhat.com> [Anna Schumaker: Change -EINVAL to -EBADF during file verification, Change flags parameter from int to unsigned int, Add function to include/linux/syscalls.h, Check copy len after file open mode, Don't forbid ranges inside the same file, Use rw_verify_area() to verify ranges, Use file_out rather than file_in, Add COPY_FR_REFLINK flag] Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/read_write.c')
-rw-r--r--fs/read_write.c120
1 files changed, 120 insertions, 0 deletions
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3faf1bb..173140029a7a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/splice.h> 17#include <linux/splice.h>
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/mount.h>
19#include "internal.h" 20#include "internal.h"
20 21
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
@@ -1327,3 +1328,122 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1327 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1328 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1328} 1329}
1329#endif 1330#endif
1331
1332/*
1333 * copy_file_range() differs from regular file read and write in that it
1334 * specifically allows return partial success. When it does so is up to
1335 * the copy_file_range method.
1336 */
1337ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1338 struct file *file_out, loff_t pos_out,
1339 size_t len, unsigned int flags)
1340{
1341 struct inode *inode_in = file_inode(file_in);
1342 struct inode *inode_out = file_inode(file_out);
1343 ssize_t ret;
1344
1345 if (flags != 0)
1346 return -EINVAL;
1347
1348 /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
1349 ret = rw_verify_area(READ, file_in, &pos_in, len);
1350 if (ret >= 0)
1351 ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1352 if (ret < 0)
1353 return ret;
1354
1355 if (!(file_in->f_mode & FMODE_READ) ||
1356 !(file_out->f_mode & FMODE_WRITE) ||
1357 (file_out->f_flags & O_APPEND) ||
1358 !file_out->f_op->copy_file_range)
1359 return -EBADF;
1360
1361 /* this could be relaxed once a method supports cross-fs copies */
1362 if (inode_in->i_sb != inode_out->i_sb)
1363 return -EXDEV;
1364
1365 if (len == 0)
1366 return 0;
1367
1368 ret = mnt_want_write_file(file_out);
1369 if (ret)
1370 return ret;
1371
1372 ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
1373 len, flags);
1374 if (ret > 0) {
1375 fsnotify_access(file_in);
1376 add_rchar(current, ret);
1377 fsnotify_modify(file_out);
1378 add_wchar(current, ret);
1379 }
1380 inc_syscr(current);
1381 inc_syscw(current);
1382
1383 mnt_drop_write_file(file_out);
1384
1385 return ret;
1386}
1387EXPORT_SYMBOL(vfs_copy_file_range);
1388
1389SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1390 int, fd_out, loff_t __user *, off_out,
1391 size_t, len, unsigned int, flags)
1392{
1393 loff_t pos_in;
1394 loff_t pos_out;
1395 struct fd f_in;
1396 struct fd f_out;
1397 ssize_t ret = -EBADF;
1398
1399 f_in = fdget(fd_in);
1400 if (!f_in.file)
1401 goto out2;
1402
1403 f_out = fdget(fd_out);
1404 if (!f_out.file)
1405 goto out1;
1406
1407 ret = -EFAULT;
1408 if (off_in) {
1409 if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1410 goto out;
1411 } else {
1412 pos_in = f_in.file->f_pos;
1413 }
1414
1415 if (off_out) {
1416 if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1417 goto out;
1418 } else {
1419 pos_out = f_out.file->f_pos;
1420 }
1421
1422 ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1423 flags);
1424 if (ret > 0) {
1425 pos_in += ret;
1426 pos_out += ret;
1427
1428 if (off_in) {
1429 if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1430 ret = -EFAULT;
1431 } else {
1432 f_in.file->f_pos = pos_in;
1433 }
1434
1435 if (off_out) {
1436 if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1437 ret = -EFAULT;
1438 } else {
1439 f_out.file->f_pos = pos_out;
1440 }
1441 }
1442
1443out:
1444 fdput(f_out);
1445out1:
1446 fdput(f_in);
1447out2:
1448 return ret;
1449}