aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c5
-rw-r--r--fs/9p/vfs_file.c14
-rw-r--r--fs/adfs/file.c8
-rw-r--r--fs/affs/file.c8
-rw-r--r--fs/afs/file.c8
-rw-r--r--fs/afs/internal.h3
-rw-r--r--fs/afs/write.c11
-rw-r--r--fs/aio.c90
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/bfs/file.c8
-rw-r--r--fs/block_dev.c40
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.h13
-rw-r--r--fs/btrfs/dev-replace.c5
-rw-r--r--fs/btrfs/disk-io.c5
-rw-r--r--fs/btrfs/extent-tree.c148
-rw-r--r--fs/btrfs/extent_io.c39
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file.c51
-rw-r--r--fs/btrfs/free-space-cache.c192
-rw-r--r--fs/btrfs/inode.c88
-rw-r--r--fs/btrfs/ioctl.c184
-rw-r--r--fs/btrfs/locking.c80
-rw-r--r--fs/btrfs/print-tree.c9
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/raid56.c5
-rw-r--r--fs/btrfs/reada.c9
-rw-r--r--fs/btrfs/scrub.c19
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/btrfs/sysfs.c32
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c2
-rw-r--r--fs/btrfs/transaction.c24
-rw-r--r--fs/btrfs/volumes.c66
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/zlib.c2
-rw-r--r--fs/ceph/acl.c6
-rw-r--r--fs/ceph/addr.c21
-rw-r--r--fs/ceph/caps.c246
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c185
-rw-r--r--fs/ceph/inode.c247
-rw-r--r--fs/ceph/mds_client.c9
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/super.h13
-rw-r--r--fs/cifs/cifs_unicode.c7
-rw-r--r--fs/cifs/cifsfs.c68
-rw-r--r--fs/cifs/cifsfs.h12
-rw-r--r--fs/cifs/file.c81
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/direct-io.c164
-rw-r--r--fs/dlm/lowcomms.c5
-rw-r--r--fs/ecryptfs/file.c13
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c7
-rw-r--r--fs/exofs/file.c10
-rw-r--r--fs/exofs/inode.c2
-rw-r--r--fs/ext2/file.c10
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext3/file.c10
-rw-r--r--fs/ext3/inode.c15
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/ext4.h3
-rw-r--r--fs/ext4/file.c35
-rw-r--r--fs/ext4/ialloc.c23
-rw-r--r--fs/ext4/indirect.c38
-rw-r--r--fs/ext4/inode.c24
-rw-r--r--fs/ext4/mballoc.c8
-rw-r--r--fs/f2fs/data.c17
-rw-r--r--fs/f2fs/file.c10
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/file.c11
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fuse/cuse.c8
-rw-r--r--fs/fuse/file.c154
-rw-r--r--fs/fuse/fuse_i.h5
-rw-r--r--fs/gfs2/aops.c11
-rw-r--r--fs/gfs2/file.c30
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfsplus/inode.c15
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/hpfs/file.c8
-rw-r--r--fs/jffs2/file.c8
-rw-r--r--fs/jfs/file.c10
-rw-r--r--fs/jfs/inode.c8
-rw-r--r--fs/kernfs/file.c69
-rw-r--r--fs/locks.c2
-rw-r--r--fs/logfs/file.c8
-rw-r--r--fs/mbcache.c3
-rw-r--r--fs/minix/file.c8
-rw-r--r--fs/nfs/direct.c326
-rw-r--r--fs/nfs/file.c65
-rw-r--r--fs/nfs/inode.c76
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4file.c10
-rw-r--r--fs/nfs/nfs4namespace.c102
-rw-r--r--fs/nfs/nfs4proc.c2
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nfsd/nfs4proc.c9
-rw-r--r--fs/nfsd/nfs4state.c78
-rw-r--r--fs/nfsd/nfs4xdr.c16
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/inode.c9
-rw-r--r--fs/ntfs/file.c9
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c57
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c3
-rw-r--r--fs/ocfs2/dlm/dlmthread.c13
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c18
-rw-r--r--fs/ocfs2/file.c138
-rw-r--r--fs/ocfs2/namei.c145
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/refcounttree.c8
-rw-r--r--fs/ocfs2/super.c8
-rw-r--r--fs/omfs/file.c8
-rw-r--r--fs/open.c6
-rw-r--r--fs/pipe.c145
-rw-r--r--fs/proc/stat.c22
-rw-r--r--fs/ramfs/file-mmu.c10
-rw-r--r--fs/ramfs/file-nommu.c10
-rw-r--r--fs/read_write.c108
-rw-r--r--fs/reiserfs/file.c10
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/romfs/mmap-nommu.c4
-rw-r--r--fs/seq_file.c30
-rw-r--r--fs/splice.c195
-rw-r--r--fs/sysv/file.c8
-rw-r--r--fs/ubifs/file.c25
-rw-r--r--fs/udf/file.c19
-rw-r--r--fs/udf/inode.c10
-rw-r--r--fs/ufs/file.c8
-rw-r--r--fs/xfs/xfs_aops.c17
-rw-r--r--fs/xfs/xfs_file.c119
-rw-r--r--fs/xfs/xfs_trace.h1
141 files changed, 2597 insertions, 2228 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index c71e88602ff4..cc1cfae726b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -259,8 +259,7 @@ static int v9fs_launder_page(struct page *page)
259 * 259 *
260 */ 260 */
261static ssize_t 261static ssize_t
262v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 262v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
263 loff_t pos, unsigned long nr_segs)
264{ 263{
265 /* 264 /*
266 * FIXME 265 * FIXME
@@ -269,7 +268,7 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
269 */ 268 */
270 p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", 269 p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
271 iocb->ki_filp->f_path.dentry->d_name.name, 270 iocb->ki_filp->f_path.dentry->d_name.name,
272 (long long)pos, nr_segs); 271 (long long)pos, iter->nr_segs);
273 272
274 return -EINVAL; 273 return -EINVAL;
275} 274}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 96e550760699..520c11c2dcca 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -692,7 +692,7 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
692{ 692{
693 if (filp->f_flags & O_DIRECT) 693 if (filp->f_flags & O_DIRECT)
694 return v9fs_direct_read(filp, data, count, offset); 694 return v9fs_direct_read(filp, data, count, offset);
695 return do_sync_read(filp, data, count, offset); 695 return new_sync_read(filp, data, count, offset);
696} 696}
697 697
698/** 698/**
@@ -760,7 +760,7 @@ err_out:
760 760
761buff_write: 761buff_write:
762 mutex_unlock(&inode->i_mutex); 762 mutex_unlock(&inode->i_mutex);
763 return do_sync_write(filp, data, count, offsetp); 763 return new_sync_write(filp, data, count, offsetp);
764} 764}
765 765
766/** 766/**
@@ -778,7 +778,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
778 778
779 if (filp->f_flags & O_DIRECT) 779 if (filp->f_flags & O_DIRECT)
780 return v9fs_direct_write(filp, data, count, offset); 780 return v9fs_direct_write(filp, data, count, offset);
781 return do_sync_write(filp, data, count, offset); 781 return new_sync_write(filp, data, count, offset);
782} 782}
783 783
784 784
@@ -847,8 +847,8 @@ const struct file_operations v9fs_cached_file_operations = {
847 .llseek = generic_file_llseek, 847 .llseek = generic_file_llseek,
848 .read = v9fs_cached_file_read, 848 .read = v9fs_cached_file_read,
849 .write = v9fs_cached_file_write, 849 .write = v9fs_cached_file_write,
850 .aio_read = generic_file_aio_read, 850 .read_iter = generic_file_read_iter,
851 .aio_write = generic_file_aio_write, 851 .write_iter = generic_file_write_iter,
852 .open = v9fs_file_open, 852 .open = v9fs_file_open,
853 .release = v9fs_dir_release, 853 .release = v9fs_dir_release,
854 .lock = v9fs_file_lock, 854 .lock = v9fs_file_lock,
@@ -860,8 +860,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = {
860 .llseek = generic_file_llseek, 860 .llseek = generic_file_llseek,
861 .read = v9fs_cached_file_read, 861 .read = v9fs_cached_file_read,
862 .write = v9fs_cached_file_write, 862 .write = v9fs_cached_file_write,
863 .aio_read = generic_file_aio_read, 863 .read_iter = generic_file_read_iter,
864 .aio_write = generic_file_aio_write, 864 .write_iter = generic_file_write_iter,
865 .open = v9fs_file_open, 865 .open = v9fs_file_open,
866 .release = v9fs_dir_release, 866 .release = v9fs_dir_release,
867 .lock = v9fs_file_lock_dotl, 867 .lock = v9fs_file_lock_dotl,
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index a36da5382b40..07c9edce5aa7 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -23,12 +23,12 @@
23 23
24const struct file_operations adfs_file_operations = { 24const struct file_operations adfs_file_operations = {
25 .llseek = generic_file_llseek, 25 .llseek = generic_file_llseek,
26 .read = do_sync_read, 26 .read = new_sync_read,
27 .aio_read = generic_file_aio_read, 27 .read_iter = generic_file_read_iter,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = generic_file_fsync, 29 .fsync = generic_file_fsync,
30 .write = do_sync_write, 30 .write = new_sync_write,
31 .aio_write = generic_file_aio_write, 31 .write_iter = generic_file_write_iter,
32 .splice_read = generic_file_splice_read, 32 .splice_read = generic_file_splice_read,
33}; 33};
34 34
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0270303388ee..a7fe57d2cd9a 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -27,10 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp);
27 27
28const struct file_operations affs_file_operations = { 28const struct file_operations affs_file_operations = {
29 .llseek = generic_file_llseek, 29 .llseek = generic_file_llseek,
30 .read = do_sync_read, 30 .read = new_sync_read,
31 .aio_read = generic_file_aio_read, 31 .read_iter = generic_file_read_iter,
32 .write = do_sync_write, 32 .write = new_sync_write,
33 .aio_write = generic_file_aio_write, 33 .write_iter = generic_file_write_iter,
34 .mmap = generic_file_mmap, 34 .mmap = generic_file_mmap,
35 .open = affs_file_open, 35 .open = affs_file_open,
36 .release = affs_file_release, 36 .release = affs_file_release,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 66d50fe2ee45..932ce07948b3 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -31,10 +31,10 @@ const struct file_operations afs_file_operations = {
31 .open = afs_open, 31 .open = afs_open,
32 .release = afs_release, 32 .release = afs_release,
33 .llseek = generic_file_llseek, 33 .llseek = generic_file_llseek,
34 .read = do_sync_read, 34 .read = new_sync_read,
35 .write = do_sync_write, 35 .write = new_sync_write,
36 .aio_read = generic_file_aio_read, 36 .read_iter = generic_file_read_iter,
37 .aio_write = afs_file_write, 37 .write_iter = afs_file_write,
38 .mmap = generic_file_readonly_mmap, 38 .mmap = generic_file_readonly_mmap,
39 .splice_read = generic_file_splice_read, 39 .splice_read = generic_file_splice_read,
40 .fsync = afs_fsync, 40 .fsync = afs_fsync,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 590b55f46d61..71d5982312f3 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -747,8 +747,7 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
747extern int afs_writepage(struct page *, struct writeback_control *); 747extern int afs_writepage(struct page *, struct writeback_control *);
748extern int afs_writepages(struct address_space *, struct writeback_control *); 748extern int afs_writepages(struct address_space *, struct writeback_control *);
749extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); 749extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
750extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 750extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
751 unsigned long, loff_t);
752extern int afs_writeback_all(struct afs_vnode *); 751extern int afs_writeback_all(struct afs_vnode *);
753extern int afs_fsync(struct file *, loff_t, loff_t, int); 752extern int afs_fsync(struct file *, loff_t, loff_t, int);
754 753
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a890db4b9898..ab6adfd52516 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -625,15 +625,14 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
625/* 625/*
626 * write to an AFS file 626 * write to an AFS file
627 */ 627 */
628ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, 628ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
629 unsigned long nr_segs, loff_t pos)
630{ 629{
631 struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); 630 struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
632 ssize_t result; 631 ssize_t result;
633 size_t count = iov_length(iov, nr_segs); 632 size_t count = iov_iter_count(from);
634 633
635 _enter("{%x.%u},{%zu},%lu,", 634 _enter("{%x.%u},{%zu},",
636 vnode->fid.vid, vnode->fid.vnode, count, nr_segs); 635 vnode->fid.vid, vnode->fid.vnode, count);
637 636
638 if (IS_SWAPFILE(&vnode->vfs_inode)) { 637 if (IS_SWAPFILE(&vnode->vfs_inode)) {
639 printk(KERN_INFO 638 printk(KERN_INFO
@@ -644,7 +643,7 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
644 if (!count) 643 if (!count)
645 return 0; 644 return 0;
646 645
647 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 646 result = generic_file_write_iter(iocb, from);
648 if (IS_ERR_VALUE(result)) { 647 if (IS_ERR_VALUE(result)) {
649 _leave(" = %zd", result); 648 _leave(" = %zd", result);
650 return result; 649 return result;
diff --git a/fs/aio.c b/fs/aio.c
index a0ed6c7d2cd2..955947ef3e02 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -477,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
477} 477}
478EXPORT_SYMBOL(kiocb_set_cancel_fn); 478EXPORT_SYMBOL(kiocb_set_cancel_fn);
479 479
480static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) 480static int kiocb_cancel(struct kiocb *kiocb)
481{ 481{
482 kiocb_cancel_fn *old, *cancel; 482 kiocb_cancel_fn *old, *cancel;
483 483
@@ -538,7 +538,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
538 struct kiocb, ki_list); 538 struct kiocb, ki_list);
539 539
540 list_del_init(&req->ki_list); 540 list_del_init(&req->ki_list);
541 kiocb_cancel(ctx, req); 541 kiocb_cancel(req);
542 } 542 }
543 543
544 spin_unlock_irq(&ctx->ctx_lock); 544 spin_unlock_irq(&ctx->ctx_lock);
@@ -727,42 +727,42 @@ err:
727 * when the processes owning a context have all exited to encourage 727 * when the processes owning a context have all exited to encourage
728 * the rapid destruction of the kioctx. 728 * the rapid destruction of the kioctx.
729 */ 729 */
730static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, 730static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
731 struct completion *requests_done) 731 struct completion *requests_done)
732{ 732{
733 if (!atomic_xchg(&ctx->dead, 1)) { 733 struct kioctx_table *table;
734 struct kioctx_table *table;
735 734
736 spin_lock(&mm->ioctx_lock); 735 if (atomic_xchg(&ctx->dead, 1))
737 rcu_read_lock(); 736 return -EINVAL;
738 table = rcu_dereference(mm->ioctx_table);
739 737
740 WARN_ON(ctx != table->table[ctx->id]);
741 table->table[ctx->id] = NULL;
742 rcu_read_unlock();
743 spin_unlock(&mm->ioctx_lock);
744 738
745 /* percpu_ref_kill() will do the necessary call_rcu() */ 739 spin_lock(&mm->ioctx_lock);
746 wake_up_all(&ctx->wait); 740 rcu_read_lock();
741 table = rcu_dereference(mm->ioctx_table);
747 742
748 /* 743 WARN_ON(ctx != table->table[ctx->id]);
749 * It'd be more correct to do this in free_ioctx(), after all 744 table->table[ctx->id] = NULL;
750 * the outstanding kiocbs have finished - but by then io_destroy 745 rcu_read_unlock();
751 * has already returned, so io_setup() could potentially return 746 spin_unlock(&mm->ioctx_lock);
752 * -EAGAIN with no ioctxs actually in use (as far as userspace
753 * could tell).
754 */
755 aio_nr_sub(ctx->max_reqs);
756 747
757 if (ctx->mmap_size) 748 /* percpu_ref_kill() will do the necessary call_rcu() */
758 vm_munmap(ctx->mmap_base, ctx->mmap_size); 749 wake_up_all(&ctx->wait);
759 750
760 ctx->requests_done = requests_done; 751 /*
761 percpu_ref_kill(&ctx->users); 752 * It'd be more correct to do this in free_ioctx(), after all
762 } else { 753 * the outstanding kiocbs have finished - but by then io_destroy
763 if (requests_done) 754 * has already returned, so io_setup() could potentially return
764 complete(requests_done); 755 * -EAGAIN with no ioctxs actually in use (as far as userspace
765 } 756 * could tell).
757 */
758 aio_nr_sub(ctx->max_reqs);
759
760 if (ctx->mmap_size)
761 vm_munmap(ctx->mmap_base, ctx->mmap_size);
762
763 ctx->requests_done = requests_done;
764 percpu_ref_kill(&ctx->users);
765 return 0;
766} 766}
767 767
768/* wait_on_sync_kiocb: 768/* wait_on_sync_kiocb:
@@ -1021,6 +1021,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1021 1021
1022 /* everything turned out well, dispose of the aiocb. */ 1022 /* everything turned out well, dispose of the aiocb. */
1023 kiocb_free(iocb); 1023 kiocb_free(iocb);
1024 put_reqs_available(ctx, 1);
1024 1025
1025 /* 1026 /*
1026 * We have to order our ring_info tail store above and test 1027 * We have to order our ring_info tail store above and test
@@ -1062,6 +1063,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
1062 if (head == tail) 1063 if (head == tail)
1063 goto out; 1064 goto out;
1064 1065
1066 head %= ctx->nr_events;
1067 tail %= ctx->nr_events;
1068
1065 while (ret < nr) { 1069 while (ret < nr) {
1066 long avail; 1070 long avail;
1067 struct io_event *ev; 1071 struct io_event *ev;
@@ -1100,8 +1104,6 @@ static long aio_read_events_ring(struct kioctx *ctx,
1100 flush_dcache_page(ctx->ring_pages[0]); 1104 flush_dcache_page(ctx->ring_pages[0]);
1101 1105
1102 pr_debug("%li h%u t%u\n", ret, head, tail); 1106 pr_debug("%li h%u t%u\n", ret, head, tail);
1103
1104 put_reqs_available(ctx, ret);
1105out: 1107out:
1106 mutex_unlock(&ctx->ring_lock); 1108 mutex_unlock(&ctx->ring_lock);
1107 1109
@@ -1219,21 +1221,23 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1219 if (likely(NULL != ioctx)) { 1221 if (likely(NULL != ioctx)) {
1220 struct completion requests_done = 1222 struct completion requests_done =
1221 COMPLETION_INITIALIZER_ONSTACK(requests_done); 1223 COMPLETION_INITIALIZER_ONSTACK(requests_done);
1224 int ret;
1222 1225
1223 /* Pass requests_done to kill_ioctx() where it can be set 1226 /* Pass requests_done to kill_ioctx() where it can be set
1224 * in a thread-safe way. If we try to set it here then we have 1227 * in a thread-safe way. If we try to set it here then we have
1225 * a race condition if two io_destroy() called simultaneously. 1228 * a race condition if two io_destroy() called simultaneously.
1226 */ 1229 */
1227 kill_ioctx(current->mm, ioctx, &requests_done); 1230 ret = kill_ioctx(current->mm, ioctx, &requests_done);
1228 percpu_ref_put(&ioctx->users); 1231 percpu_ref_put(&ioctx->users);
1229 1232
1230 /* Wait until all IO for the context are done. Otherwise kernel 1233 /* Wait until all IO for the context are done. Otherwise kernel
1231 * keep using user-space buffers even if user thinks the context 1234 * keep using user-space buffers even if user thinks the context
1232 * is destroyed. 1235 * is destroyed.
1233 */ 1236 */
1234 wait_for_completion(&requests_done); 1237 if (!ret)
1238 wait_for_completion(&requests_done);
1235 1239
1236 return 0; 1240 return ret;
1237 } 1241 }
1238 pr_debug("EINVAL: io_destroy: invalid context id\n"); 1242 pr_debug("EINVAL: io_destroy: invalid context id\n");
1239 return -EINVAL; 1243 return -EINVAL;
@@ -1241,6 +1245,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1241 1245
1242typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, 1246typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
1243 unsigned long, loff_t); 1247 unsigned long, loff_t);
1248typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
1244 1249
1245static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, 1250static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
1246 int rw, char __user *buf, 1251 int rw, char __user *buf,
@@ -1298,7 +1303,9 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1298 int rw; 1303 int rw;
1299 fmode_t mode; 1304 fmode_t mode;
1300 aio_rw_op *rw_op; 1305 aio_rw_op *rw_op;
1306 rw_iter_op *iter_op;
1301 struct iovec inline_vec, *iovec = &inline_vec; 1307 struct iovec inline_vec, *iovec = &inline_vec;
1308 struct iov_iter iter;
1302 1309
1303 switch (opcode) { 1310 switch (opcode) {
1304 case IOCB_CMD_PREAD: 1311 case IOCB_CMD_PREAD:
@@ -1306,6 +1313,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1306 mode = FMODE_READ; 1313 mode = FMODE_READ;
1307 rw = READ; 1314 rw = READ;
1308 rw_op = file->f_op->aio_read; 1315 rw_op = file->f_op->aio_read;
1316 iter_op = file->f_op->read_iter;
1309 goto rw_common; 1317 goto rw_common;
1310 1318
1311 case IOCB_CMD_PWRITE: 1319 case IOCB_CMD_PWRITE:
@@ -1313,12 +1321,13 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1313 mode = FMODE_WRITE; 1321 mode = FMODE_WRITE;
1314 rw = WRITE; 1322 rw = WRITE;
1315 rw_op = file->f_op->aio_write; 1323 rw_op = file->f_op->aio_write;
1324 iter_op = file->f_op->write_iter;
1316 goto rw_common; 1325 goto rw_common;
1317rw_common: 1326rw_common:
1318 if (unlikely(!(file->f_mode & mode))) 1327 if (unlikely(!(file->f_mode & mode)))
1319 return -EBADF; 1328 return -EBADF;
1320 1329
1321 if (!rw_op) 1330 if (!rw_op && !iter_op)
1322 return -EINVAL; 1331 return -EINVAL;
1323 1332
1324 ret = (opcode == IOCB_CMD_PREADV || 1333 ret = (opcode == IOCB_CMD_PREADV ||
@@ -1347,7 +1356,12 @@ rw_common:
1347 if (rw == WRITE) 1356 if (rw == WRITE)
1348 file_start_write(file); 1357 file_start_write(file);
1349 1358
1350 ret = rw_op(req, iovec, nr_segs, req->ki_pos); 1359 if (iter_op) {
1360 iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
1361 ret = iter_op(req, &iter);
1362 } else {
1363 ret = rw_op(req, iovec, nr_segs, req->ki_pos);
1364 }
1351 1365
1352 if (rw == WRITE) 1366 if (rw == WRITE)
1353 file_end_write(file); 1367 file_end_write(file);
@@ -1585,7 +1599,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1585 1599
1586 kiocb = lookup_kiocb(ctx, iocb, key); 1600 kiocb = lookup_kiocb(ctx, iocb, key);
1587 if (kiocb) 1601 if (kiocb)
1588 ret = kiocb_cancel(ctx, kiocb); 1602 ret = kiocb_cancel(kiocb);
1589 else 1603 else
1590 ret = -EINVAL; 1604 ret = -EINVAL;
1591 1605
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d7bd395ab586..1c55388ae633 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
210 int pipefd; 210 int pipefd;
211 struct autofs_sb_info *sbi; 211 struct autofs_sb_info *sbi;
212 struct autofs_info *ino; 212 struct autofs_info *ino;
213 int pgrp; 213 int pgrp = 0;
214 bool pgrp_set = false; 214 bool pgrp_set = false;
215 int ret = -EINVAL; 215 int ret = -EINVAL;
216 216
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ae2892218335..e7f88ace1a25 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -23,10 +23,10 @@
23 23
24const struct file_operations bfs_file_operations = { 24const struct file_operations bfs_file_operations = {
25 .llseek = generic_file_llseek, 25 .llseek = generic_file_llseek,
26 .read = do_sync_read, 26 .read = new_sync_read,
27 .aio_read = generic_file_aio_read, 27 .read_iter = generic_file_read_iter,
28 .write = do_sync_write, 28 .write = new_sync_write,
29 .aio_write = generic_file_aio_write, 29 .write_iter = generic_file_write_iter,
30 .mmap = generic_file_mmap, 30 .mmap = generic_file_mmap,
31 .splice_read = generic_file_splice_read, 31 .splice_read = generic_file_splice_read,
32}; 32};
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 83fba15cc394..6d7274619bf9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -165,14 +165,15 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
165} 165}
166 166
167static ssize_t 167static ssize_t
168blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 168blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
169 loff_t offset, unsigned long nr_segs) 169 loff_t offset)
170{ 170{
171 struct file *file = iocb->ki_filp; 171 struct file *file = iocb->ki_filp;
172 struct inode *inode = file->f_mapping->host; 172 struct inode *inode = file->f_mapping->host;
173 173
174 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, 174 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
175 nr_segs, blkdev_get_block, NULL, NULL, 0); 175 offset, blkdev_get_block,
176 NULL, NULL, 0);
176} 177}
177 178
178int __sync_blockdev(struct block_device *bdev, int wait) 179int __sync_blockdev(struct block_device *bdev, int wait)
@@ -1571,43 +1572,38 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1571 * Does not take i_mutex for the write and thus is not for general purpose 1572 * Does not take i_mutex for the write and thus is not for general purpose
1572 * use. 1573 * use.
1573 */ 1574 */
1574ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 1575ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
1575 unsigned long nr_segs, loff_t pos)
1576{ 1576{
1577 struct file *file = iocb->ki_filp; 1577 struct file *file = iocb->ki_filp;
1578 struct blk_plug plug; 1578 struct blk_plug plug;
1579 ssize_t ret; 1579 ssize_t ret;
1580 1580
1581 BUG_ON(iocb->ki_pos != pos);
1582
1583 blk_start_plug(&plug); 1581 blk_start_plug(&plug);
1584 ret = __generic_file_aio_write(iocb, iov, nr_segs); 1582 ret = __generic_file_write_iter(iocb, from);
1585 if (ret > 0) { 1583 if (ret > 0) {
1586 ssize_t err; 1584 ssize_t err;
1587 1585 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
1588 err = generic_write_sync(file, pos, ret);
1589 if (err < 0) 1586 if (err < 0)
1590 ret = err; 1587 ret = err;
1591 } 1588 }
1592 blk_finish_plug(&plug); 1589 blk_finish_plug(&plug);
1593 return ret; 1590 return ret;
1594} 1591}
1595EXPORT_SYMBOL_GPL(blkdev_aio_write); 1592EXPORT_SYMBOL_GPL(blkdev_write_iter);
1596 1593
1597static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, 1594static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
1598 unsigned long nr_segs, loff_t pos)
1599{ 1595{
1600 struct file *file = iocb->ki_filp; 1596 struct file *file = iocb->ki_filp;
1601 struct inode *bd_inode = file->f_mapping->host; 1597 struct inode *bd_inode = file->f_mapping->host;
1602 loff_t size = i_size_read(bd_inode); 1598 loff_t size = i_size_read(bd_inode);
1599 loff_t pos = iocb->ki_pos;
1603 1600
1604 if (pos >= size) 1601 if (pos >= size)
1605 return 0; 1602 return 0;
1606 1603
1607 size -= pos; 1604 size -= pos;
1608 if (size < iocb->ki_nbytes) 1605 iov_iter_truncate(to, size);
1609 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1606 return generic_file_read_iter(iocb, to);
1610 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1611} 1607}
1612 1608
1613/* 1609/*
@@ -1639,10 +1635,10 @@ const struct file_operations def_blk_fops = {
1639 .open = blkdev_open, 1635 .open = blkdev_open,
1640 .release = blkdev_close, 1636 .release = blkdev_close,
1641 .llseek = block_llseek, 1637 .llseek = block_llseek,
1642 .read = do_sync_read, 1638 .read = new_sync_read,
1643 .write = do_sync_write, 1639 .write = new_sync_write,
1644 .aio_read = blkdev_aio_read, 1640 .read_iter = blkdev_read_iter,
1645 .aio_write = blkdev_aio_write, 1641 .write_iter = blkdev_write_iter,
1646 .mmap = generic_file_mmap, 1642 .mmap = generic_file_mmap,
1647 .fsync = blkdev_fsync, 1643 .fsync = blkdev_fsync,
1648 .unlocked_ioctl = block_ioctl, 1644 .unlocked_ioctl = block_ioctl,
@@ -1650,7 +1646,7 @@ const struct file_operations def_blk_fops = {
1650 .compat_ioctl = compat_blkdev_ioctl, 1646 .compat_ioctl = compat_blkdev_ioctl,
1651#endif 1647#endif
1652 .splice_read = generic_file_splice_read, 1648 .splice_read = generic_file_splice_read,
1653 .splice_write = generic_file_splice_write, 1649 .splice_write = iter_file_splice_write,
1654}; 1650};
1655 1651
1656int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1652int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 92371c414228..1daea0b47187 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
821 821
822 spin_lock(workspace_lock); 822 spin_lock(workspace_lock);
823 if (*num_workspace < num_online_cpus()) { 823 if (*num_workspace < num_online_cpus()) {
824 list_add_tail(workspace, idle_workspace); 824 list_add(workspace, idle_workspace);
825 (*num_workspace)++; 825 (*num_workspace)++;
826 spin_unlock(workspace_lock); 826 spin_unlock(workspace_lock);
827 goto wake; 827 goto wake;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7e2c1c1ef36..be91397f4e92 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1259,11 +1259,19 @@ struct btrfs_block_group_cache {
1259 spinlock_t lock; 1259 spinlock_t lock;
1260 u64 pinned; 1260 u64 pinned;
1261 u64 reserved; 1261 u64 reserved;
1262 u64 delalloc_bytes;
1262 u64 bytes_super; 1263 u64 bytes_super;
1263 u64 flags; 1264 u64 flags;
1264 u64 sectorsize; 1265 u64 sectorsize;
1265 u64 cache_generation; 1266 u64 cache_generation;
1266 1267
1268 /*
1269 * It is just used for the delayed data space allocation because
1270 * only the data space allocation and the relative metadata update
1271 * can be done cross the transaction.
1272 */
1273 struct rw_semaphore data_rwsem;
1274
1267 /* for raid56, this is a full stripe, without parity */ 1275 /* for raid56, this is a full stripe, without parity */
1268 unsigned long full_stripe_len; 1276 unsigned long full_stripe_len;
1269 1277
@@ -3316,7 +3324,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
3316 struct btrfs_key *ins); 3324 struct btrfs_key *ins);
3317int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, 3325int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
3318 u64 min_alloc_size, u64 empty_size, u64 hint_byte, 3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
3319 struct btrfs_key *ins, int is_data); 3327 struct btrfs_key *ins, int is_data, int delalloc);
3320int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3321 struct extent_buffer *buf, int full_backref, int no_quota); 3329 struct extent_buffer *buf, int full_backref, int no_quota);
3322int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3330,7 +3338,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3330 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 3338 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
3331 u64 owner, u64 offset, int no_quota); 3339 u64 owner, u64 offset, int no_quota);
3332 3340
3333int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 3341int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
3342 int delalloc);
3334int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 3343int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
3335 u64 start, u64 len); 3344 u64 start, u64 len);
3336void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3345void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2af6e66fe788..eea26e1b2fda 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -36,6 +36,7 @@
36#include "check-integrity.h" 36#include "check-integrity.h"
37#include "rcu-string.h" 37#include "rcu-string.h"
38#include "dev-replace.h" 38#include "dev-replace.h"
39#include "sysfs.h"
39 40
40static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 41static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
41 int scrub_ret); 42 int scrub_ret);
@@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
562 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 563 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 564 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
564 565
566 /* replace the sysfs entry */
567 btrfs_kobj_rm_device(fs_info, src_device);
568 btrfs_kobj_add_device(fs_info, tgt_device);
569
565 btrfs_rm_dev_replace_blocked(fs_info); 570 btrfs_rm_dev_replace_blocked(fs_info);
566 571
567 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 572 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8bb4aa19898f..08e65e9cf2aa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
369out: 369out:
370 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, 370 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
371 &cached_state, GFP_NOFS); 371 &cached_state, GFP_NOFS);
372 btrfs_tree_read_unlock_blocking(eb); 372 if (need_lock)
373 btrfs_tree_read_unlock_blocking(eb);
373 return ret; 374 return ret;
374} 375}
375 376
@@ -2904,7 +2905,9 @@ retry_root_backup:
2904 if (ret) 2905 if (ret)
2905 goto fail_qgroup; 2906 goto fail_qgroup;
2906 2907
2908 mutex_lock(&fs_info->cleaner_mutex);
2907 ret = btrfs_recover_relocation(tree_root); 2909 ret = btrfs_recover_relocation(tree_root);
2910 mutex_unlock(&fs_info->cleaner_mutex);
2908 if (ret < 0) { 2911 if (ret < 0) {
2909 printk(KERN_WARNING 2912 printk(KERN_WARNING
2910 "BTRFS: failed to recover relocation\n"); 2913 "BTRFS: failed to recover relocation\n");
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fafb3e53ecde..813537f362f9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -105,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
105static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 105static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
106 int dump_block_groups); 106 int dump_block_groups);
107static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 107static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
108 u64 num_bytes, int reserve); 108 u64 num_bytes, int reserve,
109 int delalloc);
109static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 110static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
110 u64 num_bytes); 111 u64 num_bytes);
111int btrfs_pin_extent(struct btrfs_root *root, 112int btrfs_pin_extent(struct btrfs_root *root,
@@ -3260,7 +3261,8 @@ again:
3260 3261
3261 spin_lock(&block_group->lock); 3262 spin_lock(&block_group->lock);
3262 if (block_group->cached != BTRFS_CACHE_FINISHED || 3263 if (block_group->cached != BTRFS_CACHE_FINISHED ||
3263 !btrfs_test_opt(root, SPACE_CACHE)) { 3264 !btrfs_test_opt(root, SPACE_CACHE) ||
3265 block_group->delalloc_bytes) {
3264 /* 3266 /*
3265 * don't bother trying to write stuff out _if_ 3267 * don't bother trying to write stuff out _if_
3266 * a) we're not cached, 3268 * a) we're not cached,
@@ -5613,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
5613 * @cache: The cache we are manipulating 5615 * @cache: The cache we are manipulating
5614 * @num_bytes: The number of bytes in question 5616 * @num_bytes: The number of bytes in question
5615 * @reserve: One of the reservation enums 5617 * @reserve: One of the reservation enums
5618 * @delalloc: The blocks are allocated for the delalloc write
5616 * 5619 *
5617 * This is called by the allocator when it reserves space, or by somebody who is 5620 * This is called by the allocator when it reserves space, or by somebody who is
5618 * freeing space that was never actually used on disk. For example if you 5621 * freeing space that was never actually used on disk. For example if you
@@ -5631,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
5631 * succeeds. 5634 * succeeds.
5632 */ 5635 */
5633static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5636static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5634 u64 num_bytes, int reserve) 5637 u64 num_bytes, int reserve, int delalloc)
5635{ 5638{
5636 struct btrfs_space_info *space_info = cache->space_info; 5639 struct btrfs_space_info *space_info = cache->space_info;
5637 int ret = 0; 5640 int ret = 0;
@@ -5650,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5650 num_bytes, 0); 5653 num_bytes, 0);
5651 space_info->bytes_may_use -= num_bytes; 5654 space_info->bytes_may_use -= num_bytes;
5652 } 5655 }
5656
5657 if (delalloc)
5658 cache->delalloc_bytes += num_bytes;
5653 } 5659 }
5654 } else { 5660 } else {
5655 if (cache->ro) 5661 if (cache->ro)
5656 space_info->bytes_readonly += num_bytes; 5662 space_info->bytes_readonly += num_bytes;
5657 cache->reserved -= num_bytes; 5663 cache->reserved -= num_bytes;
5658 space_info->bytes_reserved -= num_bytes; 5664 space_info->bytes_reserved -= num_bytes;
5665
5666 if (delalloc)
5667 cache->delalloc_bytes -= num_bytes;
5659 } 5668 }
5660 spin_unlock(&cache->lock); 5669 spin_unlock(&cache->lock);
5661 spin_unlock(&space_info->lock); 5670 spin_unlock(&space_info->lock);
@@ -5669,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5669 struct btrfs_caching_control *next; 5678 struct btrfs_caching_control *next;
5670 struct btrfs_caching_control *caching_ctl; 5679 struct btrfs_caching_control *caching_ctl;
5671 struct btrfs_block_group_cache *cache; 5680 struct btrfs_block_group_cache *cache;
5672 struct btrfs_space_info *space_info;
5673 5681
5674 down_write(&fs_info->commit_root_sem); 5682 down_write(&fs_info->commit_root_sem);
5675 5683
@@ -5692,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5692 5700
5693 up_write(&fs_info->commit_root_sem); 5701 up_write(&fs_info->commit_root_sem);
5694 5702
5695 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5696 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5697
5698 update_global_block_rsv(fs_info); 5703 update_global_block_rsv(fs_info);
5699} 5704}
5700 5705
@@ -5732,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5732 spin_lock(&cache->lock); 5737 spin_lock(&cache->lock);
5733 cache->pinned -= len; 5738 cache->pinned -= len;
5734 space_info->bytes_pinned -= len; 5739 space_info->bytes_pinned -= len;
5740 percpu_counter_add(&space_info->total_bytes_pinned, -len);
5735 if (cache->ro) { 5741 if (cache->ro) {
5736 space_info->bytes_readonly += len; 5742 space_info->bytes_readonly += len;
5737 readonly = true; 5743 readonly = true;
@@ -6206,7 +6212,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6206 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6212 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6207 6213
6208 btrfs_add_free_space(cache, buf->start, buf->len); 6214 btrfs_add_free_space(cache, buf->start, buf->len);
6209 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 6215 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6210 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6216 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6211 pin = 0; 6217 pin = 0;
6212 } 6218 }
@@ -6365,6 +6371,70 @@ enum btrfs_loop_type {
6365 LOOP_NO_EMPTY_SIZE = 3, 6371 LOOP_NO_EMPTY_SIZE = 3,
6366}; 6372};
6367 6373
6374static inline void
6375btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6376 int delalloc)
6377{
6378 if (delalloc)
6379 down_read(&cache->data_rwsem);
6380}
6381
6382static inline void
6383btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6384 int delalloc)
6385{
6386 btrfs_get_block_group(cache);
6387 if (delalloc)
6388 down_read(&cache->data_rwsem);
6389}
6390
6391static struct btrfs_block_group_cache *
6392btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6393 struct btrfs_free_cluster *cluster,
6394 int delalloc)
6395{
6396 struct btrfs_block_group_cache *used_bg;
6397 bool locked = false;
6398again:
6399 spin_lock(&cluster->refill_lock);
6400 if (locked) {
6401 if (used_bg == cluster->block_group)
6402 return used_bg;
6403
6404 up_read(&used_bg->data_rwsem);
6405 btrfs_put_block_group(used_bg);
6406 }
6407
6408 used_bg = cluster->block_group;
6409 if (!used_bg)
6410 return NULL;
6411
6412 if (used_bg == block_group)
6413 return used_bg;
6414
6415 btrfs_get_block_group(used_bg);
6416
6417 if (!delalloc)
6418 return used_bg;
6419
6420 if (down_read_trylock(&used_bg->data_rwsem))
6421 return used_bg;
6422
6423 spin_unlock(&cluster->refill_lock);
6424 down_read(&used_bg->data_rwsem);
6425 locked = true;
6426 goto again;
6427}
6428
6429static inline void
6430btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6431 int delalloc)
6432{
6433 if (delalloc)
6434 up_read(&cache->data_rwsem);
6435 btrfs_put_block_group(cache);
6436}
6437
6368/* 6438/*
6369 * walks the btree of allocated extents and find a hole of a given size. 6439 * walks the btree of allocated extents and find a hole of a given size.
6370 * The key ins is changed to record the hole: 6440 * The key ins is changed to record the hole:
@@ -6379,7 +6449,7 @@ enum btrfs_loop_type {
6379static noinline int find_free_extent(struct btrfs_root *orig_root, 6449static noinline int find_free_extent(struct btrfs_root *orig_root,
6380 u64 num_bytes, u64 empty_size, 6450 u64 num_bytes, u64 empty_size,
6381 u64 hint_byte, struct btrfs_key *ins, 6451 u64 hint_byte, struct btrfs_key *ins,
6382 u64 flags) 6452 u64 flags, int delalloc)
6383{ 6453{
6384 int ret = 0; 6454 int ret = 0;
6385 struct btrfs_root *root = orig_root->fs_info->extent_root; 6455 struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6467,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6467 up_read(&space_info->groups_sem); 6537 up_read(&space_info->groups_sem);
6468 } else { 6538 } else {
6469 index = get_block_group_index(block_group); 6539 index = get_block_group_index(block_group);
6540 btrfs_lock_block_group(block_group, delalloc);
6470 goto have_block_group; 6541 goto have_block_group;
6471 } 6542 }
6472 } else if (block_group) { 6543 } else if (block_group) {
@@ -6481,7 +6552,7 @@ search:
6481 u64 offset; 6552 u64 offset;
6482 int cached; 6553 int cached;
6483 6554
6484 btrfs_get_block_group(block_group); 6555 btrfs_grab_block_group(block_group, delalloc);
6485 search_start = block_group->key.objectid; 6556 search_start = block_group->key.objectid;
6486 6557
6487 /* 6558 /*
@@ -6529,16 +6600,16 @@ have_block_group:
6529 * the refill lock keeps out other 6600 * the refill lock keeps out other
6530 * people trying to start a new cluster 6601 * people trying to start a new cluster
6531 */ 6602 */
6532 spin_lock(&last_ptr->refill_lock); 6603 used_block_group = btrfs_lock_cluster(block_group,
6533 used_block_group = last_ptr->block_group; 6604 last_ptr,
6534 if (used_block_group != block_group && 6605 delalloc);
6535 (!used_block_group || 6606 if (!used_block_group)
6536 used_block_group->ro ||
6537 !block_group_bits(used_block_group, flags)))
6538 goto refill_cluster; 6607 goto refill_cluster;
6539 6608
6540 if (used_block_group != block_group) 6609 if (used_block_group != block_group &&
6541 btrfs_get_block_group(used_block_group); 6610 (used_block_group->ro ||
6611 !block_group_bits(used_block_group, flags)))
6612 goto release_cluster;
6542 6613
6543 offset = btrfs_alloc_from_cluster(used_block_group, 6614 offset = btrfs_alloc_from_cluster(used_block_group,
6544 last_ptr, 6615 last_ptr,
@@ -6552,16 +6623,15 @@ have_block_group:
6552 used_block_group, 6623 used_block_group,
6553 search_start, num_bytes); 6624 search_start, num_bytes);
6554 if (used_block_group != block_group) { 6625 if (used_block_group != block_group) {
6555 btrfs_put_block_group(block_group); 6626 btrfs_release_block_group(block_group,
6627 delalloc);
6556 block_group = used_block_group; 6628 block_group = used_block_group;
6557 } 6629 }
6558 goto checks; 6630 goto checks;
6559 } 6631 }
6560 6632
6561 WARN_ON(last_ptr->block_group != used_block_group); 6633 WARN_ON(last_ptr->block_group != used_block_group);
6562 if (used_block_group != block_group) 6634release_cluster:
6563 btrfs_put_block_group(used_block_group);
6564refill_cluster:
6565 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6635 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6566 * set up a new clusters, so lets just skip it 6636 * set up a new clusters, so lets just skip it
6567 * and let the allocator find whatever block 6637 * and let the allocator find whatever block
@@ -6578,8 +6648,10 @@ refill_cluster:
6578 * succeeding in the unclustered 6648 * succeeding in the unclustered
6579 * allocation. */ 6649 * allocation. */
6580 if (loop >= LOOP_NO_EMPTY_SIZE && 6650 if (loop >= LOOP_NO_EMPTY_SIZE &&
6581 last_ptr->block_group != block_group) { 6651 used_block_group != block_group) {
6582 spin_unlock(&last_ptr->refill_lock); 6652 spin_unlock(&last_ptr->refill_lock);
6653 btrfs_release_block_group(used_block_group,
6654 delalloc);
6583 goto unclustered_alloc; 6655 goto unclustered_alloc;
6584 } 6656 }
6585 6657
@@ -6589,6 +6661,10 @@ refill_cluster:
6589 */ 6661 */
6590 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
6591 6663
6664 if (used_block_group != block_group)
6665 btrfs_release_block_group(used_block_group,
6666 delalloc);
6667refill_cluster:
6592 if (loop >= LOOP_NO_EMPTY_SIZE) { 6668 if (loop >= LOOP_NO_EMPTY_SIZE) {
6593 spin_unlock(&last_ptr->refill_lock); 6669 spin_unlock(&last_ptr->refill_lock);
6594 goto unclustered_alloc; 6670 goto unclustered_alloc;
@@ -6696,7 +6772,7 @@ checks:
6696 BUG_ON(offset > search_start); 6772 BUG_ON(offset > search_start);
6697 6773
6698 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 6774 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
6699 alloc_type); 6775 alloc_type, delalloc);
6700 if (ret == -EAGAIN) { 6776 if (ret == -EAGAIN) {
6701 btrfs_add_free_space(block_group, offset, num_bytes); 6777 btrfs_add_free_space(block_group, offset, num_bytes);
6702 goto loop; 6778 goto loop;
@@ -6708,13 +6784,13 @@ checks:
6708 6784
6709 trace_btrfs_reserve_extent(orig_root, block_group, 6785 trace_btrfs_reserve_extent(orig_root, block_group,
6710 search_start, num_bytes); 6786 search_start, num_bytes);
6711 btrfs_put_block_group(block_group); 6787 btrfs_release_block_group(block_group, delalloc);
6712 break; 6788 break;
6713loop: 6789loop:
6714 failed_cluster_refill = false; 6790 failed_cluster_refill = false;
6715 failed_alloc = false; 6791 failed_alloc = false;
6716 BUG_ON(index != get_block_group_index(block_group)); 6792 BUG_ON(index != get_block_group_index(block_group));
6717 btrfs_put_block_group(block_group); 6793 btrfs_release_block_group(block_group, delalloc);
6718 } 6794 }
6719 up_read(&space_info->groups_sem); 6795 up_read(&space_info->groups_sem);
6720 6796
@@ -6827,7 +6903,7 @@ again:
6827int btrfs_reserve_extent(struct btrfs_root *root, 6903int btrfs_reserve_extent(struct btrfs_root *root,
6828 u64 num_bytes, u64 min_alloc_size, 6904 u64 num_bytes, u64 min_alloc_size,
6829 u64 empty_size, u64 hint_byte, 6905 u64 empty_size, u64 hint_byte,
6830 struct btrfs_key *ins, int is_data) 6906 struct btrfs_key *ins, int is_data, int delalloc)
6831{ 6907{
6832 bool final_tried = false; 6908 bool final_tried = false;
6833 u64 flags; 6909 u64 flags;
@@ -6837,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
6837again: 6913again:
6838 WARN_ON(num_bytes < root->sectorsize); 6914 WARN_ON(num_bytes < root->sectorsize);
6839 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6915 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
6840 flags); 6916 flags, delalloc);
6841 6917
6842 if (ret == -ENOSPC) { 6918 if (ret == -ENOSPC) {
6843 if (!final_tried && ins->offset) { 6919 if (!final_tried && ins->offset) {
@@ -6862,7 +6938,8 @@ again:
6862} 6938}
6863 6939
6864static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6940static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6865 u64 start, u64 len, int pin) 6941 u64 start, u64 len,
6942 int pin, int delalloc)
6866{ 6943{
6867 struct btrfs_block_group_cache *cache; 6944 struct btrfs_block_group_cache *cache;
6868 int ret = 0; 6945 int ret = 0;
@@ -6881,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6881 pin_down_extent(root, cache, start, len, 1); 6958 pin_down_extent(root, cache, start, len, 1);
6882 else { 6959 else {
6883 btrfs_add_free_space(cache, start, len); 6960 btrfs_add_free_space(cache, start, len);
6884 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); 6961 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
6885 } 6962 }
6886 btrfs_put_block_group(cache); 6963 btrfs_put_block_group(cache);
6887 6964
@@ -6891,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6891} 6968}
6892 6969
6893int btrfs_free_reserved_extent(struct btrfs_root *root, 6970int btrfs_free_reserved_extent(struct btrfs_root *root,
6894 u64 start, u64 len) 6971 u64 start, u64 len, int delalloc)
6895{ 6972{
6896 return __btrfs_free_reserved_extent(root, start, len, 0); 6973 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
6897} 6974}
6898 6975
6899int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6976int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6900 u64 start, u64 len) 6977 u64 start, u64 len)
6901{ 6978{
6902 return __btrfs_free_reserved_extent(root, start, len, 1); 6979 return __btrfs_free_reserved_extent(root, start, len, 1, 0);
6903} 6980}
6904 6981
6905static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6982static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -7114,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7114 return -EINVAL; 7191 return -EINVAL;
7115 7192
7116 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 7193 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7117 RESERVE_ALLOC_NO_ACCOUNT); 7194 RESERVE_ALLOC_NO_ACCOUNT, 0);
7118 BUG_ON(ret); /* logic error */ 7195 BUG_ON(ret); /* logic error */
7119 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 7196 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7120 0, owner, offset, ins, 1); 7197 0, owner, offset, ins, 1);
@@ -7256,7 +7333,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
7256 return ERR_CAST(block_rsv); 7333 return ERR_CAST(block_rsv);
7257 7334
7258 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7335 ret = btrfs_reserve_extent(root, blocksize, blocksize,
7259 empty_size, hint, &ins, 0); 7336 empty_size, hint, &ins, 0, 0);
7260 if (ret) { 7337 if (ret) {
7261 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7338 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7262 return ERR_PTR(ret); 7339 return ERR_PTR(ret);
@@ -8659,6 +8736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8659 start); 8736 start);
8660 atomic_set(&cache->count, 1); 8737 atomic_set(&cache->count, 1);
8661 spin_lock_init(&cache->lock); 8738 spin_lock_init(&cache->lock);
8739 init_rwsem(&cache->data_rwsem);
8662 INIT_LIST_HEAD(&cache->list); 8740 INIT_LIST_HEAD(&cache->list);
8663 INIT_LIST_HEAD(&cache->cluster_list); 8741 INIT_LIST_HEAD(&cache->cluster_list);
8664 INIT_LIST_HEAD(&cache->new_bg_list); 8742 INIT_LIST_HEAD(&cache->new_bg_list);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f25a9092b946..a389820d158b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2354,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2354{ 2354{
2355 int uptodate = (err == 0); 2355 int uptodate = (err == 0);
2356 struct extent_io_tree *tree; 2356 struct extent_io_tree *tree;
2357 int ret; 2357 int ret = 0;
2358 2358
2359 tree = &BTRFS_I(page->mapping->host)->io_tree; 2359 tree = &BTRFS_I(page->mapping->host)->io_tree;
2360 2360
@@ -5068,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5068 } 5068 }
5069} 5069}
5070 5070
5071int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5072 unsigned long start,
5073 unsigned long len)
5074{
5075 size_t cur;
5076 size_t offset;
5077 struct page *page;
5078 char *kaddr;
5079 char __user *dst = (char __user *)dstv;
5080 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5081 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5082 int ret = 0;
5083
5084 WARN_ON(start > eb->len);
5085 WARN_ON(start + len > eb->start + eb->len);
5086
5087 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5088
5089 while (len > 0) {
5090 page = extent_buffer_page(eb, i);
5091
5092 cur = min(len, (PAGE_CACHE_SIZE - offset));
5093 kaddr = page_address(page);
5094 if (copy_to_user(dst, kaddr + offset, cur)) {
5095 ret = -EFAULT;
5096 break;
5097 }
5098
5099 dst += cur;
5100 len -= cur;
5101 offset = 0;
5102 i++;
5103 }
5104
5105 return ret;
5106}
5107
5071int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 5108int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5072 unsigned long min_len, char **map, 5109 unsigned long min_len, char **map,
5073 unsigned long *map_start, 5110 unsigned long *map_start,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8b63f2d46518..ccc264e7bde1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -158,7 +158,6 @@ struct extent_buffer {
158 * to unlock 158 * to unlock
159 */ 159 */
160 wait_queue_head_t read_lock_wq; 160 wait_queue_head_t read_lock_wq;
161 wait_queue_head_t lock_wq;
162 struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; 161 struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
163#ifdef CONFIG_BTRFS_DEBUG 162#ifdef CONFIG_BTRFS_DEBUG
164 struct list_head leak_list; 163 struct list_head leak_list;
@@ -304,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
304void read_extent_buffer(struct extent_buffer *eb, void *dst, 303void read_extent_buffer(struct extent_buffer *eb, void *dst,
305 unsigned long start, 304 unsigned long start,
306 unsigned long len); 305 unsigned long len);
306int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
307 unsigned long start,
308 unsigned long len);
307void write_extent_buffer(struct extent_buffer *eb, const void *src, 309void write_extent_buffer(struct extent_buffer *eb, const void *src,
308 unsigned long start, unsigned long len); 310 unsigned long start, unsigned long len);
309void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 311void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1874aee69c86..225302b39afb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em)
75 if (atomic_dec_and_test(&em->refs)) { 75 if (atomic_dec_and_test(&em->refs)) {
76 WARN_ON(extent_map_in_tree(em)); 76 WARN_ON(extent_map_in_tree(em));
77 WARN_ON(!list_empty(&em->list)); 77 WARN_ON(!list_empty(&em->list));
78 if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
79 kfree(em->bdev);
78 kmem_cache_free(extent_map_cache, em); 80 kmem_cache_free(extent_map_cache, em);
79 } 81 }
80} 82}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e7fd8a56a140..b2991fd8583e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ 17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
18#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
18 19
19struct extent_map { 20struct extent_map {
20 struct rb_node rb_node; 21 struct rb_node rb_node;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e472441feb5d..1f2b99cb55ea 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
448 write_bytes -= copied; 448 write_bytes -= copied;
449 total_copied += copied; 449 total_copied += copied;
450 450
451 /* Return to btrfs_file_aio_write to fault page */ 451 /* Return to btrfs_file_write_iter to fault page */
452 if (unlikely(copied == 0)) 452 if (unlikely(copied == 0))
453 break; 453 break;
454 454
@@ -1675,27 +1675,22 @@ again:
1675} 1675}
1676 1676
1677static ssize_t __btrfs_direct_write(struct kiocb *iocb, 1677static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1678 const struct iovec *iov, 1678 struct iov_iter *from,
1679 unsigned long nr_segs, loff_t pos, 1679 loff_t pos)
1680 size_t count, size_t ocount)
1681{ 1680{
1682 struct file *file = iocb->ki_filp; 1681 struct file *file = iocb->ki_filp;
1683 struct iov_iter i;
1684 ssize_t written; 1682 ssize_t written;
1685 ssize_t written_buffered; 1683 ssize_t written_buffered;
1686 loff_t endbyte; 1684 loff_t endbyte;
1687 int err; 1685 int err;
1688 1686
1689 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 1687 written = generic_file_direct_write(iocb, from, pos);
1690 count, ocount);
1691 1688
1692 if (written < 0 || written == count) 1689 if (written < 0 || !iov_iter_count(from))
1693 return written; 1690 return written;
1694 1691
1695 pos += written; 1692 pos += written;
1696 count -= written; 1693 written_buffered = __btrfs_buffered_write(file, from, pos);
1697 iov_iter_init(&i, iov, nr_segs, count, written);
1698 written_buffered = __btrfs_buffered_write(file, &i, pos);
1699 if (written_buffered < 0) { 1694 if (written_buffered < 0) {
1700 err = written_buffered; 1695 err = written_buffered;
1701 goto out; 1696 goto out;
@@ -1730,9 +1725,8 @@ static void update_time_for_write(struct inode *inode)
1730 inode_inc_iversion(inode); 1725 inode_inc_iversion(inode);
1731} 1726}
1732 1727
1733static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1728static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1734 const struct iovec *iov, 1729 struct iov_iter *from)
1735 unsigned long nr_segs, loff_t pos)
1736{ 1730{
1737 struct file *file = iocb->ki_filp; 1731 struct file *file = iocb->ki_filp;
1738 struct inode *inode = file_inode(file); 1732 struct inode *inode = file_inode(file);
@@ -1741,18 +1735,12 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1741 u64 end_pos; 1735 u64 end_pos;
1742 ssize_t num_written = 0; 1736 ssize_t num_written = 0;
1743 ssize_t err = 0; 1737 ssize_t err = 0;
1744 size_t count, ocount; 1738 size_t count = iov_iter_count(from);
1745 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); 1739 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1740 loff_t pos = iocb->ki_pos;
1746 1741
1747 mutex_lock(&inode->i_mutex); 1742 mutex_lock(&inode->i_mutex);
1748 1743
1749 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1750 if (err) {
1751 mutex_unlock(&inode->i_mutex);
1752 goto out;
1753 }
1754 count = ocount;
1755
1756 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1744 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1757 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1745 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1758 if (err) { 1746 if (err) {
@@ -1765,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1765 goto out; 1753 goto out;
1766 } 1754 }
1767 1755
1756 iov_iter_truncate(from, count);
1757
1768 err = file_remove_suid(file); 1758 err = file_remove_suid(file);
1769 if (err) { 1759 if (err) {
1770 mutex_unlock(&inode->i_mutex); 1760 mutex_unlock(&inode->i_mutex);
@@ -1806,14 +1796,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1806 atomic_inc(&BTRFS_I(inode)->sync_writers); 1796 atomic_inc(&BTRFS_I(inode)->sync_writers);
1807 1797
1808 if (unlikely(file->f_flags & O_DIRECT)) { 1798 if (unlikely(file->f_flags & O_DIRECT)) {
1809 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1799 num_written = __btrfs_direct_write(iocb, from, pos);
1810 pos, count, ocount);
1811 } else { 1800 } else {
1812 struct iov_iter i; 1801 num_written = __btrfs_buffered_write(file, from, pos);
1813
1814 iov_iter_init(&i, iov, nr_segs, count, num_written);
1815
1816 num_written = __btrfs_buffered_write(file, &i, pos);
1817 if (num_written > 0) 1802 if (num_written > 0)
1818 iocb->ki_pos = pos + num_written; 1803 iocb->ki_pos = pos + num_written;
1819 } 1804 }
@@ -2740,11 +2725,11 @@ out:
2740 2725
2741const struct file_operations btrfs_file_operations = { 2726const struct file_operations btrfs_file_operations = {
2742 .llseek = btrfs_file_llseek, 2727 .llseek = btrfs_file_llseek,
2743 .read = do_sync_read, 2728 .read = new_sync_read,
2744 .write = do_sync_write, 2729 .write = new_sync_write,
2745 .aio_read = generic_file_aio_read, 2730 .read_iter = generic_file_read_iter,
2746 .splice_read = generic_file_splice_read, 2731 .splice_read = generic_file_splice_read,
2747 .aio_write = btrfs_file_aio_write, 2732 .write_iter = btrfs_file_write_iter,
2748 .mmap = btrfs_file_mmap, 2733 .mmap = btrfs_file_mmap,
2749 .open = generic_file_open, 2734 .open = generic_file_open,
2750 .release = btrfs_release_file, 2735 .release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 372b05ff1943..2b0a627cb5f9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -274,18 +274,32 @@ struct io_ctl {
274}; 274};
275 275
276static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, 276static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
277 struct btrfs_root *root) 277 struct btrfs_root *root, int write)
278{ 278{
279 int num_pages;
280 int check_crcs = 0;
281
282 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
283 PAGE_CACHE_SHIFT;
284
285 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
286 check_crcs = 1;
287
288 /* Make sure we can fit our crcs into the first page */
289 if (write && check_crcs &&
290 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
291 return -ENOSPC;
292
279 memset(io_ctl, 0, sizeof(struct io_ctl)); 293 memset(io_ctl, 0, sizeof(struct io_ctl));
280 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 294
281 PAGE_CACHE_SHIFT; 295 io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
282 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
283 GFP_NOFS);
284 if (!io_ctl->pages) 296 if (!io_ctl->pages)
285 return -ENOMEM; 297 return -ENOMEM;
298
299 io_ctl->num_pages = num_pages;
286 io_ctl->root = root; 300 io_ctl->root = root;
287 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 301 io_ctl->check_crcs = check_crcs;
288 io_ctl->check_crcs = 1; 302
289 return 0; 303 return 0;
290} 304}
291 305
@@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
666 generation = btrfs_free_space_generation(leaf, header); 680 generation = btrfs_free_space_generation(leaf, header);
667 btrfs_release_path(path); 681 btrfs_release_path(path);
668 682
683 if (!BTRFS_I(inode)->generation) {
684 btrfs_info(root->fs_info,
685 "The free space cache file (%llu) is invalid. skip it\n",
686 offset);
687 return 0;
688 }
689
669 if (BTRFS_I(inode)->generation != generation) { 690 if (BTRFS_I(inode)->generation != generation) {
670 btrfs_err(root->fs_info, 691 btrfs_err(root->fs_info,
671 "free space inode generation (%llu) " 692 "free space inode generation (%llu) "
@@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
677 if (!num_entries) 698 if (!num_entries)
678 return 0; 699 return 0;
679 700
680 ret = io_ctl_init(&io_ctl, inode, root); 701 ret = io_ctl_init(&io_ctl, inode, root, 0);
681 if (ret) 702 if (ret)
682 return ret; 703 return ret;
683 704
@@ -957,19 +978,18 @@ fail:
957} 978}
958 979
959static noinline_for_stack int 980static noinline_for_stack int
960add_ioctl_entries(struct btrfs_root *root, 981write_pinned_extent_entries(struct btrfs_root *root,
961 struct inode *inode, 982 struct btrfs_block_group_cache *block_group,
962 struct btrfs_block_group_cache *block_group, 983 struct io_ctl *io_ctl,
963 struct io_ctl *io_ctl, 984 int *entries)
964 struct extent_state **cached_state,
965 struct list_head *bitmap_list,
966 int *entries)
967{ 985{
968 u64 start, extent_start, extent_end, len; 986 u64 start, extent_start, extent_end, len;
969 struct list_head *pos, *n;
970 struct extent_io_tree *unpin = NULL; 987 struct extent_io_tree *unpin = NULL;
971 int ret; 988 int ret;
972 989
990 if (!block_group)
991 return 0;
992
973 /* 993 /*
974 * We want to add any pinned extents to our free space cache 994 * We want to add any pinned extents to our free space cache
975 * so we don't leak the space 995 * so we don't leak the space
@@ -979,23 +999,19 @@ add_ioctl_entries(struct btrfs_root *root,
979 */ 999 */
980 unpin = root->fs_info->pinned_extents; 1000 unpin = root->fs_info->pinned_extents;
981 1001
982 if (block_group) 1002 start = block_group->key.objectid;
983 start = block_group->key.objectid;
984 1003
985 while (block_group && (start < block_group->key.objectid + 1004 while (start < block_group->key.objectid + block_group->key.offset) {
986 block_group->key.offset)) {
987 ret = find_first_extent_bit(unpin, start, 1005 ret = find_first_extent_bit(unpin, start,
988 &extent_start, &extent_end, 1006 &extent_start, &extent_end,
989 EXTENT_DIRTY, NULL); 1007 EXTENT_DIRTY, NULL);
990 if (ret) { 1008 if (ret)
991 ret = 0; 1009 return 0;
992 break;
993 }
994 1010
995 /* This pinned extent is out of our range */ 1011 /* This pinned extent is out of our range */
996 if (extent_start >= block_group->key.objectid + 1012 if (extent_start >= block_group->key.objectid +
997 block_group->key.offset) 1013 block_group->key.offset)
998 break; 1014 return 0;
999 1015
1000 extent_start = max(extent_start, start); 1016 extent_start = max(extent_start, start);
1001 extent_end = min(block_group->key.objectid + 1017 extent_end = min(block_group->key.objectid +
@@ -1005,11 +1021,20 @@ add_ioctl_entries(struct btrfs_root *root,
1005 *entries += 1; 1021 *entries += 1;
1006 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1022 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1007 if (ret) 1023 if (ret)
1008 goto out_nospc; 1024 return -ENOSPC;
1009 1025
1010 start = extent_end; 1026 start = extent_end;
1011 } 1027 }
1012 1028
1029 return 0;
1030}
1031
1032static noinline_for_stack int
1033write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
1034{
1035 struct list_head *pos, *n;
1036 int ret;
1037
1013 /* Write out the bitmaps */ 1038 /* Write out the bitmaps */
1014 list_for_each_safe(pos, n, bitmap_list) { 1039 list_for_each_safe(pos, n, bitmap_list) {
1015 struct btrfs_free_space *entry = 1040 struct btrfs_free_space *entry =
@@ -1017,36 +1042,24 @@ add_ioctl_entries(struct btrfs_root *root,
1017 1042
1018 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1043 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1019 if (ret) 1044 if (ret)
1020 goto out_nospc; 1045 return -ENOSPC;
1021 list_del_init(&entry->list); 1046 list_del_init(&entry->list);
1022 } 1047 }
1023 1048
1024 /* Zero out the rest of the pages just to make sure */ 1049 return 0;
1025 io_ctl_zero_remaining_pages(io_ctl); 1050}
1026
1027 ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1028 0, i_size_read(inode), cached_state);
1029 io_ctl_drop_pages(io_ctl);
1030 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1031 i_size_read(inode) - 1, cached_state, GFP_NOFS);
1032 1051
1033 if (ret) 1052static int flush_dirty_cache(struct inode *inode)
1034 goto fail; 1053{
1054 int ret;
1035 1055
1036 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1056 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1037 if (ret) { 1057 if (ret)
1038 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1058 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1039 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 1059 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1040 GFP_NOFS); 1060 GFP_NOFS);
1041 goto fail;
1042 }
1043 return 0;
1044 1061
1045fail: 1062 return ret;
1046 return -1;
1047
1048out_nospc:
1049 return -ENOSPC;
1050} 1063}
1051 1064
1052static void noinline_for_stack 1065static void noinline_for_stack
@@ -1056,6 +1069,7 @@ cleanup_write_cache_enospc(struct inode *inode,
1056 struct list_head *bitmap_list) 1069 struct list_head *bitmap_list)
1057{ 1070{
1058 struct list_head *pos, *n; 1071 struct list_head *pos, *n;
1072
1059 list_for_each_safe(pos, n, bitmap_list) { 1073 list_for_each_safe(pos, n, bitmap_list) {
1060 struct btrfs_free_space *entry = 1074 struct btrfs_free_space *entry =
1061 list_entry(pos, struct btrfs_free_space, list); 1075 list_entry(pos, struct btrfs_free_space, list);
@@ -1088,64 +1102,104 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1088{ 1102{
1089 struct extent_state *cached_state = NULL; 1103 struct extent_state *cached_state = NULL;
1090 struct io_ctl io_ctl; 1104 struct io_ctl io_ctl;
1091 struct list_head bitmap_list; 1105 LIST_HEAD(bitmap_list);
1092 int entries = 0; 1106 int entries = 0;
1093 int bitmaps = 0; 1107 int bitmaps = 0;
1094 int ret; 1108 int ret;
1095 int err = -1;
1096
1097 INIT_LIST_HEAD(&bitmap_list);
1098 1109
1099 if (!i_size_read(inode)) 1110 if (!i_size_read(inode))
1100 return -1; 1111 return -1;
1101 1112
1102 ret = io_ctl_init(&io_ctl, inode, root); 1113 ret = io_ctl_init(&io_ctl, inode, root, 1);
1103 if (ret) 1114 if (ret)
1104 return -1; 1115 return -1;
1105 1116
1117 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1118 down_write(&block_group->data_rwsem);
1119 spin_lock(&block_group->lock);
1120 if (block_group->delalloc_bytes) {
1121 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1122 spin_unlock(&block_group->lock);
1123 up_write(&block_group->data_rwsem);
1124 BTRFS_I(inode)->generation = 0;
1125 ret = 0;
1126 goto out;
1127 }
1128 spin_unlock(&block_group->lock);
1129 }
1130
1106 /* Lock all pages first so we can lock the extent safely. */ 1131 /* Lock all pages first so we can lock the extent safely. */
1107 io_ctl_prepare_pages(&io_ctl, inode, 0); 1132 io_ctl_prepare_pages(&io_ctl, inode, 0);
1108 1133
1109 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1134 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1110 0, &cached_state); 1135 0, &cached_state);
1111 1136
1112
1113 /* Make sure we can fit our crcs into the first page */
1114 if (io_ctl.check_crcs &&
1115 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
1116 goto out_nospc;
1117
1118 io_ctl_set_generation(&io_ctl, trans->transid); 1137 io_ctl_set_generation(&io_ctl, trans->transid);
1119 1138
1139 /* Write out the extent entries in the free space cache */
1120 ret = write_cache_extent_entries(&io_ctl, ctl, 1140 ret = write_cache_extent_entries(&io_ctl, ctl,
1121 block_group, &entries, &bitmaps, 1141 block_group, &entries, &bitmaps,
1122 &bitmap_list); 1142 &bitmap_list);
1123 if (ret) 1143 if (ret)
1124 goto out_nospc; 1144 goto out_nospc;
1125 1145
1126 ret = add_ioctl_entries(root, inode, block_group, &io_ctl, 1146 /*
1127 &cached_state, &bitmap_list, &entries); 1147 * Some spaces that are freed in the current transaction are pinned,
1148 * they will be added into free space cache after the transaction is
1149 * committed, we shouldn't lose them.
1150 */
1151 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1152 if (ret)
1153 goto out_nospc;
1128 1154
1129 if (ret == -ENOSPC) 1155 /* At last, we write out all the bitmaps. */
1156 ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1157 if (ret)
1130 goto out_nospc; 1158 goto out_nospc;
1131 else if (ret) 1159
1160 /* Zero out the rest of the pages just to make sure */
1161 io_ctl_zero_remaining_pages(&io_ctl);
1162
1163 /* Everything is written out, now we dirty the pages in the file. */
1164 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
1165 0, i_size_read(inode), &cached_state);
1166 if (ret)
1167 goto out_nospc;
1168
1169 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1170 up_write(&block_group->data_rwsem);
1171 /*
1172 * Release the pages and unlock the extent, we will flush
1173 * them out later
1174 */
1175 io_ctl_drop_pages(&io_ctl);
1176
1177 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1178 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1179
1180 /* Flush the dirty pages in the cache file. */
1181 ret = flush_dirty_cache(inode);
1182 if (ret)
1132 goto out; 1183 goto out;
1133 1184
1134 err = update_cache_item(trans, root, inode, path, offset, 1185 /* Update the cache item to tell everyone this cache file is valid. */
1186 ret = update_cache_item(trans, root, inode, path, offset,
1135 entries, bitmaps); 1187 entries, bitmaps);
1136
1137out: 1188out:
1138 io_ctl_free(&io_ctl); 1189 io_ctl_free(&io_ctl);
1139 if (err) { 1190 if (ret) {
1140 invalidate_inode_pages2(inode->i_mapping); 1191 invalidate_inode_pages2(inode->i_mapping);
1141 BTRFS_I(inode)->generation = 0; 1192 BTRFS_I(inode)->generation = 0;
1142 } 1193 }
1143 btrfs_update_inode(trans, root, inode); 1194 btrfs_update_inode(trans, root, inode);
1144 return err; 1195 return ret;
1145 1196
1146out_nospc: 1197out_nospc:
1147
1148 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); 1198 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
1199
1200 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1201 up_write(&block_group->data_rwsem);
1202
1149 goto out; 1203 goto out;
1150} 1204}
1151 1205
@@ -1165,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1165 spin_unlock(&block_group->lock); 1219 spin_unlock(&block_group->lock);
1166 return 0; 1220 return 0;
1167 } 1221 }
1222
1223 if (block_group->delalloc_bytes) {
1224 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1225 spin_unlock(&block_group->lock);
1226 return 0;
1227 }
1168 spin_unlock(&block_group->lock); 1228 spin_unlock(&block_group->lock);
1169 1229
1170 inode = lookup_free_space_inode(root, block_group, path); 1230 inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7fa5f7fd7bc7..3668048e16f8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -693,7 +693,7 @@ retry:
693 ret = btrfs_reserve_extent(root, 693 ret = btrfs_reserve_extent(root,
694 async_extent->compressed_size, 694 async_extent->compressed_size,
695 async_extent->compressed_size, 695 async_extent->compressed_size,
696 0, alloc_hint, &ins, 1); 696 0, alloc_hint, &ins, 1, 1);
697 if (ret) { 697 if (ret) {
698 int i; 698 int i;
699 699
@@ -794,7 +794,7 @@ retry:
794out: 794out:
795 return ret; 795 return ret;
796out_free_reserve: 796out_free_reserve:
797 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 797 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
798out_free: 798out_free:
799 extent_clear_unlock_delalloc(inode, async_extent->start, 799 extent_clear_unlock_delalloc(inode, async_extent->start,
800 async_extent->start + 800 async_extent->start +
@@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
917 cur_alloc_size = disk_num_bytes; 917 cur_alloc_size = disk_num_bytes;
918 ret = btrfs_reserve_extent(root, cur_alloc_size, 918 ret = btrfs_reserve_extent(root, cur_alloc_size,
919 root->sectorsize, 0, alloc_hint, 919 root->sectorsize, 0, alloc_hint,
920 &ins, 1); 920 &ins, 1, 1);
921 if (ret < 0) 921 if (ret < 0)
922 goto out_unlock; 922 goto out_unlock;
923 923
@@ -995,7 +995,7 @@ out:
995 return ret; 995 return ret;
996 996
997out_reserve: 997out_reserve:
998 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 998 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
999out_unlock: 999out_unlock:
1000 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1000 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1001 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 1001 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -2599,6 +2599,21 @@ out_kfree:
2599 return NULL; 2599 return NULL;
2600} 2600}
2601 2601
2602static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2603 u64 start, u64 len)
2604{
2605 struct btrfs_block_group_cache *cache;
2606
2607 cache = btrfs_lookup_block_group(root->fs_info, start);
2608 ASSERT(cache);
2609
2610 spin_lock(&cache->lock);
2611 cache->delalloc_bytes -= len;
2612 spin_unlock(&cache->lock);
2613
2614 btrfs_put_block_group(cache);
2615}
2616
2602/* as ordered data IO finishes, this gets called so we can finish 2617/* as ordered data IO finishes, this gets called so we can finish
2603 * an ordered extent if the range of bytes in the file it covers are 2618 * an ordered extent if the range of bytes in the file it covers are
2604 * fully written. 2619 * fully written.
@@ -2698,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2698 logical_len, logical_len, 2713 logical_len, logical_len,
2699 compress_type, 0, 0, 2714 compress_type, 0, 0,
2700 BTRFS_FILE_EXTENT_REG); 2715 BTRFS_FILE_EXTENT_REG);
2716 if (!ret)
2717 btrfs_release_delalloc_bytes(root,
2718 ordered_extent->start,
2719 ordered_extent->disk_len);
2701 } 2720 }
2702 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 2721 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2703 ordered_extent->file_offset, ordered_extent->len, 2722 ordered_extent->file_offset, ordered_extent->len,
@@ -2750,7 +2769,7 @@ out:
2750 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 2769 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2751 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) 2770 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2752 btrfs_free_reserved_extent(root, ordered_extent->start, 2771 btrfs_free_reserved_extent(root, ordered_extent->start,
2753 ordered_extent->disk_len); 2772 ordered_extent->disk_len, 1);
2754 } 2773 }
2755 2774
2756 2775
@@ -6535,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6535 6554
6536 alloc_hint = get_extent_allocation_hint(inode, start, len); 6555 alloc_hint = get_extent_allocation_hint(inode, start, len);
6537 ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, 6556 ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6538 alloc_hint, &ins, 1); 6557 alloc_hint, &ins, 1, 1);
6539 if (ret) 6558 if (ret)
6540 return ERR_PTR(ret); 6559 return ERR_PTR(ret);
6541 6560
6542 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, 6561 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6543 ins.offset, ins.offset, ins.offset, 0); 6562 ins.offset, ins.offset, ins.offset, 0);
6544 if (IS_ERR(em)) { 6563 if (IS_ERR(em)) {
6545 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 6564 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6546 return em; 6565 return em;
6547 } 6566 }
6548 6567
6549 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 6568 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6550 ins.offset, ins.offset, 0); 6569 ins.offset, ins.offset, 0);
6551 if (ret) { 6570 if (ret) {
6552 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 6571 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6553 free_extent_map(em); 6572 free_extent_map(em);
6554 return ERR_PTR(ret); 6573 return ERR_PTR(ret);
6555 } 6574 }
@@ -7437,7 +7456,7 @@ free_ordered:
7437 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 7456 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7438 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 7457 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7439 btrfs_free_reserved_extent(root, ordered->start, 7458 btrfs_free_reserved_extent(root, ordered->start,
7440 ordered->disk_len); 7459 ordered->disk_len, 1);
7441 btrfs_put_ordered_extent(ordered); 7460 btrfs_put_ordered_extent(ordered);
7442 btrfs_put_ordered_extent(ordered); 7461 btrfs_put_ordered_extent(ordered);
7443 } 7462 }
@@ -7445,39 +7464,30 @@ free_ordered:
7445} 7464}
7446 7465
7447static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 7466static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7448 const struct iovec *iov, loff_t offset, 7467 const struct iov_iter *iter, loff_t offset)
7449 unsigned long nr_segs)
7450{ 7468{
7451 int seg; 7469 int seg;
7452 int i; 7470 int i;
7453 size_t size;
7454 unsigned long addr;
7455 unsigned blocksize_mask = root->sectorsize - 1; 7471 unsigned blocksize_mask = root->sectorsize - 1;
7456 ssize_t retval = -EINVAL; 7472 ssize_t retval = -EINVAL;
7457 loff_t end = offset;
7458 7473
7459 if (offset & blocksize_mask) 7474 if (offset & blocksize_mask)
7460 goto out; 7475 goto out;
7461 7476
7462 /* Check the memory alignment. Blocks cannot straddle pages */ 7477 if (iov_iter_alignment(iter) & blocksize_mask)
7463 for (seg = 0; seg < nr_segs; seg++) { 7478 goto out;
7464 addr = (unsigned long)iov[seg].iov_base;
7465 size = iov[seg].iov_len;
7466 end += size;
7467 if ((addr & blocksize_mask) || (size & blocksize_mask))
7468 goto out;
7469
7470 /* If this is a write we don't need to check anymore */
7471 if (rw & WRITE)
7472 continue;
7473 7479
7474 /* 7480 /* If this is a write we don't need to check anymore */
7475 * Check to make sure we don't have duplicate iov_base's in this 7481 if (rw & WRITE)
7476 * iovec, if so return EINVAL, otherwise we'll get csum errors 7482 return 0;
7477 * when reading back. 7483 /*
7478 */ 7484 * Check to make sure we don't have duplicate iov_base's in this
7479 for (i = seg + 1; i < nr_segs; i++) { 7485 * iovec, if so return EINVAL, otherwise we'll get csum errors
7480 if (iov[seg].iov_base == iov[i].iov_base) 7486 * when reading back.
7487 */
7488 for (seg = 0; seg < iter->nr_segs; seg++) {
7489 for (i = seg + 1; i < iter->nr_segs; i++) {
7490 if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
7481 goto out; 7491 goto out;
7482 } 7492 }
7483 } 7493 }
@@ -7487,8 +7497,7 @@ out:
7487} 7497}
7488 7498
7489static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 7499static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7490 const struct iovec *iov, loff_t offset, 7500 struct iov_iter *iter, loff_t offset)
7491 unsigned long nr_segs)
7492{ 7501{
7493 struct file *file = iocb->ki_filp; 7502 struct file *file = iocb->ki_filp;
7494 struct inode *inode = file->f_mapping->host; 7503 struct inode *inode = file->f_mapping->host;
@@ -7498,8 +7507,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7498 bool relock = false; 7507 bool relock = false;
7499 ssize_t ret; 7508 ssize_t ret;
7500 7509
7501 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 7510 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
7502 offset, nr_segs))
7503 return 0; 7511 return 0;
7504 7512
7505 atomic_inc(&inode->i_dio_count); 7513 atomic_inc(&inode->i_dio_count);
@@ -7511,7 +7519,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7511 * we need to flush the dirty pages again to make absolutely sure 7519 * we need to flush the dirty pages again to make absolutely sure
7512 * that any outstanding dirty pages are on disk. 7520 * that any outstanding dirty pages are on disk.
7513 */ 7521 */
7514 count = iov_length(iov, nr_segs); 7522 count = iov_iter_count(iter);
7515 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7523 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7516 &BTRFS_I(inode)->runtime_flags)) 7524 &BTRFS_I(inode)->runtime_flags))
7517 filemap_fdatawrite_range(inode->i_mapping, offset, count); 7525 filemap_fdatawrite_range(inode->i_mapping, offset, count);
@@ -7538,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7538 7546
7539 ret = __blockdev_direct_IO(rw, iocb, inode, 7547 ret = __blockdev_direct_IO(rw, iocb, inode,
7540 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 7548 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7541 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 7549 iter, offset, btrfs_get_blocks_direct, NULL,
7542 btrfs_submit_direct, flags); 7550 btrfs_submit_direct, flags);
7543 if (rw & WRITE) { 7551 if (rw & WRITE) {
7544 if (ret < 0 && ret != -EIOCBQUEUED) 7552 if (ret < 0 && ret != -EIOCBQUEUED)
@@ -8819,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8819 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); 8827 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8820 cur_bytes = max(cur_bytes, min_size); 8828 cur_bytes = max(cur_bytes, min_size);
8821 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, 8829 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8822 *alloc_hint, &ins, 1); 8830 *alloc_hint, &ins, 1, 0);
8823 if (ret) { 8831 if (ret) {
8824 if (own_trans) 8832 if (own_trans)
8825 btrfs_end_transaction(trans, root); 8833 btrfs_end_transaction(trans, root);
@@ -8833,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8833 BTRFS_FILE_EXTENT_PREALLOC); 8841 BTRFS_FILE_EXTENT_PREALLOC);
8834 if (ret) { 8842 if (ret) {
8835 btrfs_free_reserved_extent(root, ins.objectid, 8843 btrfs_free_reserved_extent(root, ins.objectid,
8836 ins.offset); 8844 ins.offset, 0);
8837 btrfs_abort_transaction(trans, root, ret); 8845 btrfs_abort_transaction(trans, root, ret);
8838 if (own_trans) 8846 if (own_trans)
8839 btrfs_end_transaction(trans, root); 8847 btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 82c18ba12e3f..47aceb494d1d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
136void btrfs_update_iflags(struct inode *inode) 136void btrfs_update_iflags(struct inode *inode)
137{ 137{
138 struct btrfs_inode *ip = BTRFS_I(inode); 138 struct btrfs_inode *ip = BTRFS_I(inode);
139 139 unsigned int new_fl = 0;
140 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
141 140
142 if (ip->flags & BTRFS_INODE_SYNC) 141 if (ip->flags & BTRFS_INODE_SYNC)
143 inode->i_flags |= S_SYNC; 142 new_fl |= S_SYNC;
144 if (ip->flags & BTRFS_INODE_IMMUTABLE) 143 if (ip->flags & BTRFS_INODE_IMMUTABLE)
145 inode->i_flags |= S_IMMUTABLE; 144 new_fl |= S_IMMUTABLE;
146 if (ip->flags & BTRFS_INODE_APPEND) 145 if (ip->flags & BTRFS_INODE_APPEND)
147 inode->i_flags |= S_APPEND; 146 new_fl |= S_APPEND;
148 if (ip->flags & BTRFS_INODE_NOATIME) 147 if (ip->flags & BTRFS_INODE_NOATIME)
149 inode->i_flags |= S_NOATIME; 148 new_fl |= S_NOATIME;
150 if (ip->flags & BTRFS_INODE_DIRSYNC) 149 if (ip->flags & BTRFS_INODE_DIRSYNC)
151 inode->i_flags |= S_DIRSYNC; 150 new_fl |= S_DIRSYNC;
151
152 set_mask_bits(&inode->i_flags,
153 S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
154 new_fl);
152} 155}
153 156
154/* 157/*
@@ -1957,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
1957 struct btrfs_path *path, 1960 struct btrfs_path *path,
1958 struct btrfs_key *key, 1961 struct btrfs_key *key,
1959 struct btrfs_ioctl_search_key *sk, 1962 struct btrfs_ioctl_search_key *sk,
1960 char *buf, 1963 size_t *buf_size,
1964 char __user *ubuf,
1961 unsigned long *sk_offset, 1965 unsigned long *sk_offset,
1962 int *num_found) 1966 int *num_found)
1963{ 1967{
@@ -1989,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root,
1989 if (!key_in_sk(key, sk)) 1993 if (!key_in_sk(key, sk))
1990 continue; 1994 continue;
1991 1995
1992 if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) 1996 if (sizeof(sh) + item_len > *buf_size) {
1997 if (*num_found) {
1998 ret = 1;
1999 goto out;
2000 }
2001
2002 /*
2003 * return one empty item back for v1, which does not
2004 * handle -EOVERFLOW
2005 */
2006
2007 *buf_size = sizeof(sh) + item_len;
1993 item_len = 0; 2008 item_len = 0;
2009 ret = -EOVERFLOW;
2010 }
1994 2011
1995 if (sizeof(sh) + item_len + *sk_offset > 2012 if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
1996 BTRFS_SEARCH_ARGS_BUFSIZE) {
1997 ret = 1; 2013 ret = 1;
1998 goto overflow; 2014 goto out;
1999 } 2015 }
2000 2016
2001 sh.objectid = key->objectid; 2017 sh.objectid = key->objectid;
@@ -2005,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
2005 sh.transid = found_transid; 2021 sh.transid = found_transid;
2006 2022
2007 /* copy search result header */ 2023 /* copy search result header */
2008 memcpy(buf + *sk_offset, &sh, sizeof(sh)); 2024 if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
2025 ret = -EFAULT;
2026 goto out;
2027 }
2028
2009 *sk_offset += sizeof(sh); 2029 *sk_offset += sizeof(sh);
2010 2030
2011 if (item_len) { 2031 if (item_len) {
2012 char *p = buf + *sk_offset; 2032 char __user *up = ubuf + *sk_offset;
2013 /* copy the item */ 2033 /* copy the item */
2014 read_extent_buffer(leaf, p, 2034 if (read_extent_buffer_to_user(leaf, up,
2015 item_off, item_len); 2035 item_off, item_len)) {
2036 ret = -EFAULT;
2037 goto out;
2038 }
2039
2016 *sk_offset += item_len; 2040 *sk_offset += item_len;
2017 } 2041 }
2018 (*num_found)++; 2042 (*num_found)++;
2019 2043
2020 if (*num_found >= sk->nr_items) 2044 if (ret) /* -EOVERFLOW from above */
2021 break; 2045 goto out;
2046
2047 if (*num_found >= sk->nr_items) {
2048 ret = 1;
2049 goto out;
2050 }
2022 } 2051 }
2023advance_key: 2052advance_key:
2024 ret = 0; 2053 ret = 0;
@@ -2033,22 +2062,37 @@ advance_key:
2033 key->objectid++; 2062 key->objectid++;
2034 } else 2063 } else
2035 ret = 1; 2064 ret = 1;
2036overflow: 2065out:
2066 /*
2067 * 0: all items from this leaf copied, continue with next
2068 * 1: * more items can be copied, but unused buffer is too small
2069 * * all items were found
2070 * Either way, it will stops the loop which iterates to the next
2071 * leaf
2072 * -EOVERFLOW: item was to large for buffer
2073 * -EFAULT: could not copy extent buffer back to userspace
2074 */
2037 return ret; 2075 return ret;
2038} 2076}
2039 2077
2040static noinline int search_ioctl(struct inode *inode, 2078static noinline int search_ioctl(struct inode *inode,
2041 struct btrfs_ioctl_search_args *args) 2079 struct btrfs_ioctl_search_key *sk,
2080 size_t *buf_size,
2081 char __user *ubuf)
2042{ 2082{
2043 struct btrfs_root *root; 2083 struct btrfs_root *root;
2044 struct btrfs_key key; 2084 struct btrfs_key key;
2045 struct btrfs_path *path; 2085 struct btrfs_path *path;
2046 struct btrfs_ioctl_search_key *sk = &args->key;
2047 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; 2086 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
2048 int ret; 2087 int ret;
2049 int num_found = 0; 2088 int num_found = 0;
2050 unsigned long sk_offset = 0; 2089 unsigned long sk_offset = 0;
2051 2090
2091 if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
2092 *buf_size = sizeof(struct btrfs_ioctl_search_header);
2093 return -EOVERFLOW;
2094 }
2095
2052 path = btrfs_alloc_path(); 2096 path = btrfs_alloc_path();
2053 if (!path) 2097 if (!path)
2054 return -ENOMEM; 2098 return -ENOMEM;
@@ -2082,14 +2126,15 @@ static noinline int search_ioctl(struct inode *inode,
2082 ret = 0; 2126 ret = 0;
2083 goto err; 2127 goto err;
2084 } 2128 }
2085 ret = copy_to_sk(root, path, &key, sk, args->buf, 2129 ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
2086 &sk_offset, &num_found); 2130 &sk_offset, &num_found);
2087 btrfs_release_path(path); 2131 btrfs_release_path(path);
2088 if (ret || num_found >= sk->nr_items) 2132 if (ret)
2089 break; 2133 break;
2090 2134
2091 } 2135 }
2092 ret = 0; 2136 if (ret > 0)
2137 ret = 0;
2093err: 2138err:
2094 sk->nr_items = num_found; 2139 sk->nr_items = num_found;
2095 btrfs_free_path(path); 2140 btrfs_free_path(path);
@@ -2099,22 +2144,73 @@ err:
2099static noinline int btrfs_ioctl_tree_search(struct file *file, 2144static noinline int btrfs_ioctl_tree_search(struct file *file,
2100 void __user *argp) 2145 void __user *argp)
2101{ 2146{
2102 struct btrfs_ioctl_search_args *args; 2147 struct btrfs_ioctl_search_args __user *uargs;
2103 struct inode *inode; 2148 struct btrfs_ioctl_search_key sk;
2104 int ret; 2149 struct inode *inode;
2150 int ret;
2151 size_t buf_size;
2105 2152
2106 if (!capable(CAP_SYS_ADMIN)) 2153 if (!capable(CAP_SYS_ADMIN))
2107 return -EPERM; 2154 return -EPERM;
2108 2155
2109 args = memdup_user(argp, sizeof(*args)); 2156 uargs = (struct btrfs_ioctl_search_args __user *)argp;
2110 if (IS_ERR(args)) 2157
2111 return PTR_ERR(args); 2158 if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
2159 return -EFAULT;
2160
2161 buf_size = sizeof(uargs->buf);
2112 2162
2113 inode = file_inode(file); 2163 inode = file_inode(file);
2114 ret = search_ioctl(inode, args); 2164 ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
2115 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2165
2166 /*
2167 * In the origin implementation an overflow is handled by returning a
2168 * search header with a len of zero, so reset ret.
2169 */
2170 if (ret == -EOVERFLOW)
2171 ret = 0;
2172
2173 if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
2116 ret = -EFAULT; 2174 ret = -EFAULT;
2117 kfree(args); 2175 return ret;
2176}
2177
2178static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
2179 void __user *argp)
2180{
2181 struct btrfs_ioctl_search_args_v2 __user *uarg;
2182 struct btrfs_ioctl_search_args_v2 args;
2183 struct inode *inode;
2184 int ret;
2185 size_t buf_size;
2186 const size_t buf_limit = 16 * 1024 * 1024;
2187
2188 if (!capable(CAP_SYS_ADMIN))
2189 return -EPERM;
2190
2191 /* copy search header and buffer size */
2192 uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
2193 if (copy_from_user(&args, uarg, sizeof(args)))
2194 return -EFAULT;
2195
2196 buf_size = args.buf_size;
2197
2198 if (buf_size < sizeof(struct btrfs_ioctl_search_header))
2199 return -EOVERFLOW;
2200
2201 /* limit result size to 16MB */
2202 if (buf_size > buf_limit)
2203 buf_size = buf_limit;
2204
2205 inode = file_inode(file);
2206 ret = search_ioctl(inode, &args.key, &buf_size,
2207 (char *)(&uarg->buf[0]));
2208 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
2209 ret = -EFAULT;
2210 else if (ret == -EOVERFLOW &&
2211 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
2212 ret = -EFAULT;
2213
2118 return ret; 2214 return ret;
2119} 2215}
2120 2216
@@ -3046,7 +3142,6 @@ out:
3046static void clone_update_extent_map(struct inode *inode, 3142static void clone_update_extent_map(struct inode *inode,
3047 const struct btrfs_trans_handle *trans, 3143 const struct btrfs_trans_handle *trans,
3048 const struct btrfs_path *path, 3144 const struct btrfs_path *path,
3049 struct btrfs_file_extent_item *fi,
3050 const u64 hole_offset, 3145 const u64 hole_offset,
3051 const u64 hole_len) 3146 const u64 hole_len)
3052{ 3147{
@@ -3061,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode,
3061 return; 3156 return;
3062 } 3157 }
3063 3158
3064 if (fi) { 3159 if (path) {
3160 struct btrfs_file_extent_item *fi;
3161
3162 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3163 struct btrfs_file_extent_item);
3065 btrfs_extent_item_to_extent_map(inode, path, fi, false, em); 3164 btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
3066 em->generation = -1; 3165 em->generation = -1;
3067 if (btrfs_file_extent_type(path->nodes[0], fi) == 3166 if (btrfs_file_extent_type(path->nodes[0], fi) ==
@@ -3415,18 +3514,15 @@ process_slot:
3415 btrfs_item_ptr_offset(leaf, slot), 3514 btrfs_item_ptr_offset(leaf, slot),
3416 size); 3515 size);
3417 inode_add_bytes(inode, datal); 3516 inode_add_bytes(inode, datal);
3418 extent = btrfs_item_ptr(leaf, slot,
3419 struct btrfs_file_extent_item);
3420 } 3517 }
3421 3518
3422 /* If we have an implicit hole (NO_HOLES feature). */ 3519 /* If we have an implicit hole (NO_HOLES feature). */
3423 if (drop_start < new_key.offset) 3520 if (drop_start < new_key.offset)
3424 clone_update_extent_map(inode, trans, 3521 clone_update_extent_map(inode, trans,
3425 path, NULL, drop_start, 3522 NULL, drop_start,
3426 new_key.offset - drop_start); 3523 new_key.offset - drop_start);
3427 3524
3428 clone_update_extent_map(inode, trans, path, 3525 clone_update_extent_map(inode, trans, path, 0, 0);
3429 extent, 0, 0);
3430 3526
3431 btrfs_mark_buffer_dirty(leaf); 3527 btrfs_mark_buffer_dirty(leaf);
3432 btrfs_release_path(path); 3528 btrfs_release_path(path);
@@ -3469,12 +3565,10 @@ process_slot:
3469 btrfs_end_transaction(trans, root); 3565 btrfs_end_transaction(trans, root);
3470 goto out; 3566 goto out;
3471 } 3567 }
3568 clone_update_extent_map(inode, trans, NULL, last_dest_end,
3569 destoff + len - last_dest_end);
3472 ret = clone_finish_inode_update(trans, inode, destoff + len, 3570 ret = clone_finish_inode_update(trans, inode, destoff + len,
3473 destoff, olen); 3571 destoff, olen);
3474 if (ret)
3475 goto out;
3476 clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
3477 destoff + len - last_dest_end);
3478 } 3572 }
3479 3573
3480out: 3574out:
@@ -5198,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int
5198 return btrfs_ioctl_trans_end(file); 5292 return btrfs_ioctl_trans_end(file);
5199 case BTRFS_IOC_TREE_SEARCH: 5293 case BTRFS_IOC_TREE_SEARCH:
5200 return btrfs_ioctl_tree_search(file, argp); 5294 return btrfs_ioctl_tree_search(file, argp);
5295 case BTRFS_IOC_TREE_SEARCH_V2:
5296 return btrfs_ioctl_tree_search_v2(file, argp);
5201 case BTRFS_IOC_INO_LOOKUP: 5297 case BTRFS_IOC_INO_LOOKUP:
5202 return btrfs_ioctl_ino_lookup(file, argp); 5298 return btrfs_ioctl_ino_lookup(file, argp);
5203 case BTRFS_IOC_INO_PATHS: 5299 case BTRFS_IOC_INO_PATHS:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f2373..5665d2149249 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) { 36 /*
37 read_lock(&eb->lock); 37 * no lock is required. The lock owner may change if
38 if (eb->lock_nested && current->pid == eb->lock_owner) { 38 * we have a read lock, but it won't change to or away
39 read_unlock(&eb->lock); 39 * from us. If we have the write lock, we are the owner
40 return; 40 * and it'll never change.
41 } 41 */
42 read_unlock(&eb->lock); 42 if (eb->lock_nested && current->pid == eb->lock_owner)
43 } 43 return;
44 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
45 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
46 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
65 */ 65 */
66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
67{ 67{
68 if (eb->lock_nested) { 68 /*
69 read_lock(&eb->lock); 69 * no lock is required. The lock owner may change if
70 if (eb->lock_nested && current->pid == eb->lock_owner) { 70 * we have a read lock, but it won't change to or away
71 read_unlock(&eb->lock); 71 * from us. If we have the write lock, we are the owner
72 return; 72 * and it'll never change.
73 } 73 */
74 read_unlock(&eb->lock); 74 if (eb->lock_nested && current->pid == eb->lock_owner)
75 } 75 return;
76
76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 77 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
77 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 78 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
78 write_lock(&eb->lock); 79 write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
99void btrfs_tree_read_lock(struct extent_buffer *eb) 100void btrfs_tree_read_lock(struct extent_buffer *eb)
100{ 101{
101again: 102again:
103 BUG_ON(!atomic_read(&eb->blocking_writers) &&
104 current->pid == eb->lock_owner);
105
102 read_lock(&eb->lock); 106 read_lock(&eb->lock);
103 if (atomic_read(&eb->blocking_writers) && 107 if (atomic_read(&eb->blocking_writers) &&
104 current->pid == eb->lock_owner) { 108 current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
132 if (atomic_read(&eb->blocking_writers)) 136 if (atomic_read(&eb->blocking_writers))
133 return 0; 137 return 0;
134 138
135 read_lock(&eb->lock); 139 if (!read_trylock(&eb->lock))
140 return 0;
141
136 if (atomic_read(&eb->blocking_writers)) { 142 if (atomic_read(&eb->blocking_writers)) {
137 read_unlock(&eb->lock); 143 read_unlock(&eb->lock);
138 return 0; 144 return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
151 if (atomic_read(&eb->blocking_writers) || 157 if (atomic_read(&eb->blocking_writers) ||
152 atomic_read(&eb->blocking_readers)) 158 atomic_read(&eb->blocking_readers))
153 return 0; 159 return 0;
154 write_lock(&eb->lock); 160
161 if (!write_trylock(&eb->lock))
162 return 0;
163
155 if (atomic_read(&eb->blocking_writers) || 164 if (atomic_read(&eb->blocking_writers) ||
156 atomic_read(&eb->blocking_readers)) { 165 atomic_read(&eb->blocking_readers)) {
157 write_unlock(&eb->lock); 166 write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
168 */ 177 */
169void btrfs_tree_read_unlock(struct extent_buffer *eb) 178void btrfs_tree_read_unlock(struct extent_buffer *eb)
170{ 179{
171 if (eb->lock_nested) { 180 /*
172 read_lock(&eb->lock); 181 * if we're nested, we have the write lock. No new locking
173 if (eb->lock_nested && current->pid == eb->lock_owner) { 182 * is needed as long as we are the lock owner.
174 eb->lock_nested = 0; 183 * The write unlock will do a barrier for us, and the lock_nested
175 read_unlock(&eb->lock); 184 * field only matters to the lock owner.
176 return; 185 */
177 } 186 if (eb->lock_nested && current->pid == eb->lock_owner) {
178 read_unlock(&eb->lock); 187 eb->lock_nested = 0;
188 return;
179 } 189 }
180 btrfs_assert_tree_read_locked(eb); 190 btrfs_assert_tree_read_locked(eb);
181 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 191 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
189 */ 199 */
190void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 200void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
191{ 201{
192 if (eb->lock_nested) { 202 /*
193 read_lock(&eb->lock); 203 * if we're nested, we have the write lock. No new locking
194 if (eb->lock_nested && current->pid == eb->lock_owner) { 204 * is needed as long as we are the lock owner.
195 eb->lock_nested = 0; 205 * The write unlock will do a barrier for us, and the lock_nested
196 read_unlock(&eb->lock); 206 * field only matters to the lock owner.
197 return; 207 */
198 } 208 if (eb->lock_nested && current->pid == eb->lock_owner) {
199 read_unlock(&eb->lock); 209 eb->lock_nested = 0;
210 return;
200 } 211 }
201 btrfs_assert_tree_read_locked(eb); 212 btrfs_assert_tree_read_locked(eb);
202 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 213 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
244 BUG_ON(blockers > 1); 255 BUG_ON(blockers > 1);
245 256
246 btrfs_assert_tree_locked(eb); 257 btrfs_assert_tree_locked(eb);
258 eb->lock_owner = 0;
247 atomic_dec(&eb->write_locks); 259 atomic_dec(&eb->write_locks);
248 260
249 if (blockers) { 261 if (blockers) {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6efd70d3b64f..9626b4ad3b9a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
54 btrfs_extent_data_ref_count(eb, ref)); 54 btrfs_extent_data_ref_count(eb, ref));
55} 55}
56 56
57static void print_extent_item(struct extent_buffer *eb, int slot) 57static void print_extent_item(struct extent_buffer *eb, int slot, int type)
58{ 58{
59 struct btrfs_extent_item *ei; 59 struct btrfs_extent_item *ei;
60 struct btrfs_extent_inline_ref *iref; 60 struct btrfs_extent_inline_ref *iref;
@@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
63 struct btrfs_disk_key key; 63 struct btrfs_disk_key key;
64 unsigned long end; 64 unsigned long end;
65 unsigned long ptr; 65 unsigned long ptr;
66 int type;
67 u32 item_size = btrfs_item_size_nr(eb, slot); 66 u32 item_size = btrfs_item_size_nr(eb, slot);
68 u64 flags; 67 u64 flags;
69 u64 offset; 68 u64 offset;
@@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
88 btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), 87 btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
89 flags); 88 flags);
90 89
91 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 90 if ((type == BTRFS_EXTENT_ITEM_KEY) &&
91 flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
92 struct btrfs_tree_block_info *info; 92 struct btrfs_tree_block_info *info;
93 info = (struct btrfs_tree_block_info *)(ei + 1); 93 info = (struct btrfs_tree_block_info *)(ei + 1);
94 btrfs_tree_block_key(eb, info, &key); 94 btrfs_tree_block_key(eb, info, &key);
@@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
223 btrfs_disk_root_refs(l, ri)); 223 btrfs_disk_root_refs(l, ri));
224 break; 224 break;
225 case BTRFS_EXTENT_ITEM_KEY: 225 case BTRFS_EXTENT_ITEM_KEY:
226 print_extent_item(l, i); 226 case BTRFS_METADATA_ITEM_KEY:
227 print_extent_item(l, i, type);
227 break; 228 break;
228 case BTRFS_TREE_BLOCK_REF_KEY: 229 case BTRFS_TREE_BLOCK_REF_KEY:
229 printk(KERN_INFO "\t\ttree block backref\n"); 230 printk(KERN_INFO "\t\ttree block backref\n");
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cf5aead95a7f..98cb6b2630f9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1798,8 +1798,10 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
1798 return -ENOMEM; 1798 return -ENOMEM;
1799 1799
1800 tmp = ulist_alloc(GFP_NOFS); 1800 tmp = ulist_alloc(GFP_NOFS);
1801 if (!tmp) 1801 if (!tmp) {
1802 ulist_free(qgroups);
1802 return -ENOMEM; 1803 return -ENOMEM;
1804 }
1803 1805
1804 btrfs_get_tree_mod_seq(fs_info, &elem); 1806 btrfs_get_tree_mod_seq(fs_info, &elem);
1805 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, 1807 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4055291a523e..4a88f073fdd7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1956 * pages are going to be uptodate. 1956 * pages are going to be uptodate.
1957 */ 1957 */
1958 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1958 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1959 if (rbio->faila == stripe || 1959 if (rbio->faila == stripe || rbio->failb == stripe) {
1960 rbio->failb == stripe) 1960 atomic_inc(&rbio->bbio->error);
1961 continue; 1961 continue;
1962 }
1962 1963
1963 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1964 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1964 struct page *p; 1965 struct page *p;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 30947f923620..09230cf3a244 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
428 continue; 428 continue;
429 } 429 }
430 if (!dev->bdev) { 430 if (!dev->bdev) {
431 /* cannot read ahead on missing device */ 431 /*
432 continue; 432 * cannot read ahead on missing device, but for RAID5/6,
433 * REQ_GET_READ_MIRRORS returns 1. So don't skip missing
434 * device for such case.
435 */
436 if (nzones > 1)
437 continue;
433 } 438 }
434 if (dev_replace_is_ongoing && 439 if (dev_replace_is_ongoing &&
435 dev == fs_info->dev_replace.tgtdev) { 440 dev == fs_info->dev_replace.tgtdev) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ac80188eec88..b6d198f5181e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2725,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2725 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2725 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2726 length = btrfs_dev_extent_length(l, dev_extent); 2726 length = btrfs_dev_extent_length(l, dev_extent);
2727 2727
2728 if (found_key.offset + length <= start) { 2728 if (found_key.offset + length <= start)
2729 key.offset = found_key.offset + length; 2729 goto skip;
2730 btrfs_release_path(path);
2731 continue;
2732 }
2733 2730
2734 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2731 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2735 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2732 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2740,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2740 * the chunk from going away while we scrub it 2737 * the chunk from going away while we scrub it
2741 */ 2738 */
2742 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2739 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2743 if (!cache) { 2740
2744 ret = -ENOENT; 2741 /* some chunks are removed but not committed to disk yet,
2745 break; 2742 * continue scrubbing */
2746 } 2743 if (!cache)
2744 goto skip;
2745
2747 dev_replace->cursor_right = found_key.offset + length; 2746 dev_replace->cursor_right = found_key.offset + length;
2748 dev_replace->cursor_left = found_key.offset; 2747 dev_replace->cursor_left = found_key.offset;
2749 dev_replace->item_needs_writeback = 1; 2748 dev_replace->item_needs_writeback = 1;
@@ -2802,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2802 2801
2803 dev_replace->cursor_left = dev_replace->cursor_right; 2802 dev_replace->cursor_left = dev_replace->cursor_right;
2804 dev_replace->item_needs_writeback = 1; 2803 dev_replace->item_needs_writeback = 1;
2805 2804skip:
2806 key.offset = found_key.offset + length; 2805 key.offset = found_key.offset + length;
2807 btrfs_release_path(path); 2806 btrfs_release_path(path);
2808 } 2807 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4662d92a4b73..8e16bca69c56 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
522 case Opt_ssd_spread: 522 case Opt_ssd_spread:
523 btrfs_set_and_info(root, SSD_SPREAD, 523 btrfs_set_and_info(root, SSD_SPREAD,
524 "use spread ssd allocation scheme"); 524 "use spread ssd allocation scheme");
525 btrfs_set_opt(info->mount_opt, SSD);
525 break; 526 break;
526 case Opt_nossd: 527 case Opt_nossd:
527 btrfs_clear_and_info(root, NOSSD, 528 btrfs_set_and_info(root, NOSSD,
528 "not using ssd allocation scheme"); 529 "not using ssd allocation scheme");
529 btrfs_clear_opt(info->mount_opt, SSD); 530 btrfs_clear_opt(info->mount_opt, SSD);
530 break; 531 break;
@@ -1467,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1467 goto restore; 1468 goto restore;
1468 1469
1469 /* recover relocation */ 1470 /* recover relocation */
1471 mutex_lock(&fs_info->cleaner_mutex);
1470 ret = btrfs_recover_relocation(root); 1472 ret = btrfs_recover_relocation(root);
1473 mutex_unlock(&fs_info->cleaner_mutex);
1471 if (ret) 1474 if (ret)
1472 goto restore; 1475 goto restore;
1473 1476
@@ -1808,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1808 list_for_each_entry(dev, head, dev_list) { 1811 list_for_each_entry(dev, head, dev_list) {
1809 if (dev->missing) 1812 if (dev->missing)
1810 continue; 1813 continue;
1814 if (!dev->name)
1815 continue;
1811 if (!first_dev || dev->devid < first_dev->devid) 1816 if (!first_dev || dev->devid < first_dev->devid)
1812 first_dev = dev; 1817 first_dev = dev;
1813 } 1818 }
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index df39458f1487..78699364f537 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -605,14 +605,37 @@ static void init_feature_attrs(void)
605 } 605 }
606} 606}
607 607
608static int add_device_membership(struct btrfs_fs_info *fs_info) 608int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
609 struct btrfs_device *one_device)
610{
611 struct hd_struct *disk;
612 struct kobject *disk_kobj;
613
614 if (!fs_info->device_dir_kobj)
615 return -EINVAL;
616
617 if (one_device) {
618 disk = one_device->bdev->bd_part;
619 disk_kobj = &part_to_dev(disk)->kobj;
620
621 sysfs_remove_link(fs_info->device_dir_kobj,
622 disk_kobj->name);
623 }
624
625 return 0;
626}
627
628int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
629 struct btrfs_device *one_device)
609{ 630{
610 int error = 0; 631 int error = 0;
611 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 632 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
612 struct btrfs_device *dev; 633 struct btrfs_device *dev;
613 634
614 fs_info->device_dir_kobj = kobject_create_and_add("devices", 635 if (!fs_info->device_dir_kobj)
636 fs_info->device_dir_kobj = kobject_create_and_add("devices",
615 &fs_info->super_kobj); 637 &fs_info->super_kobj);
638
616 if (!fs_info->device_dir_kobj) 639 if (!fs_info->device_dir_kobj)
617 return -ENOMEM; 640 return -ENOMEM;
618 641
@@ -623,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
623 if (!dev->bdev) 646 if (!dev->bdev)
624 continue; 647 continue;
625 648
649 if (one_device && one_device != dev)
650 continue;
651
626 disk = dev->bdev->bd_part; 652 disk = dev->bdev->bd_part;
627 disk_kobj = &part_to_dev(disk)->kobj; 653 disk_kobj = &part_to_dev(disk)->kobj;
628 654
@@ -666,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
666 if (error) 692 if (error)
667 goto failure; 693 goto failure;
668 694
669 error = add_device_membership(fs_info); 695 error = btrfs_kobj_add_device(fs_info, NULL);
670 if (error) 696 if (error)
671 goto failure; 697 goto failure;
672 698
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9ab576318a84..ac46df37504c 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
66extern const char * const btrfs_feature_set_names[3]; 66extern const char * const btrfs_feature_set_names[3];
67extern struct kobj_type space_info_ktype; 67extern struct kobj_type space_info_ktype;
68extern struct kobj_type btrfs_raid_ktype; 68extern struct kobj_type btrfs_raid_ktype;
69int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
70 struct btrfs_device *one_device);
71int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
72 struct btrfs_device *one_device);
69#endif /* _BTRFS_SYSFS_H_ */ 73#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index a5dcacb5df9c..9626252ee6b4 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -135,7 +135,7 @@ restart:
135 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { 135 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
136 struct extent_buffer *eb; 136 struct extent_buffer *eb;
137 137
138 eb = radix_tree_deref_slot(slot); 138 eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
139 if (!eb) 139 if (!eb)
140 continue; 140 continue;
141 /* Shouldn't happen but that kind of thinking creates CVE's */ 141 /* Shouldn't happen but that kind of thinking creates CVE's */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index fa691b754aaf..ec3dcb202357 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -415,6 +415,8 @@ int btrfs_test_qgroups(void)
415 ret = -ENOMEM; 415 ret = -ENOMEM;
416 goto out; 416 goto out;
417 } 417 }
418 btrfs_set_header_level(root->node, 0);
419 btrfs_set_header_nritems(root->node, 0);
418 root->alloc_bytenr += 8192; 420 root->alloc_bytenr += 8192;
419 421
420 tmp_root = btrfs_alloc_dummy_root(); 422 tmp_root = btrfs_alloc_dummy_root();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9630f10f8e1e..5f379affdf23 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
386 bool reloc_reserved = false; 386 bool reloc_reserved = false;
387 int ret; 387 int ret;
388 388
389 /* Send isn't supposed to start transactions. */
390 ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
391
389 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 392 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
390 return ERR_PTR(-EROFS); 393 return ERR_PTR(-EROFS);
391 394
392 if (current->journal_info && 395 if (current->journal_info) {
393 current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
394 WARN_ON(type & TRANS_EXTWRITERS); 396 WARN_ON(type & TRANS_EXTWRITERS);
395 h = current->journal_info; 397 h = current->journal_info;
396 h->use_count++; 398 h->use_count++;
@@ -491,6 +493,7 @@ again:
491 smp_mb(); 493 smp_mb();
492 if (cur_trans->state >= TRANS_STATE_BLOCKED && 494 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
493 may_wait_transaction(root, type)) { 495 may_wait_transaction(root, type)) {
496 current->journal_info = h;
494 btrfs_commit_transaction(h, root); 497 btrfs_commit_transaction(h, root);
495 goto again; 498 goto again;
496 } 499 }
@@ -1284,11 +1287,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1284 goto fail; 1287 goto fail;
1285 } 1288 }
1286 1289
1287 pending->error = btrfs_qgroup_inherit(trans, fs_info, 1290 ret = btrfs_qgroup_inherit(trans, fs_info,
1288 root->root_key.objectid, 1291 root->root_key.objectid,
1289 objectid, pending->inherit); 1292 objectid, pending->inherit);
1290 if (pending->error) 1293 if (ret) {
1291 goto no_free_objectid; 1294 btrfs_abort_transaction(trans, root, ret);
1295 goto fail;
1296 }
1292 1297
1293 /* see comments in should_cow_block() */ 1298 /* see comments in should_cow_block() */
1294 set_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1299 set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1613,11 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1613 int ret; 1618 int ret;
1614 1619
1615 ret = btrfs_run_delayed_items(trans, root); 1620 ret = btrfs_run_delayed_items(trans, root);
1616 /*
1617 * running the delayed items may have added new refs. account
1618 * them now so that they hinder processing of more delayed refs
1619 * as little as possible.
1620 */
1621 if (ret) 1621 if (ret)
1622 return ret; 1622 return ret;
1623 1623
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ffeed6d6326f..6104676857f5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,7 @@
40#include "rcu-string.h" 40#include "rcu-string.h"
41#include "math.h" 41#include "math.h"
42#include "dev-replace.h" 42#include "dev-replace.h"
43#include "sysfs.h"
43 44
44static int init_first_rw_device(struct btrfs_trans_handle *trans, 45static int init_first_rw_device(struct btrfs_trans_handle *trans,
45 struct btrfs_root *root, 46 struct btrfs_root *root,
@@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
554 * This is ok to do without rcu read locked because we hold the 555 * This is ok to do without rcu read locked because we hold the
555 * uuid mutex so nothing we touch in here is going to disappear. 556 * uuid mutex so nothing we touch in here is going to disappear.
556 */ 557 */
557 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 558 if (orig_dev->name) {
558 if (!name) { 559 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
559 kfree(device); 560 if (!name) {
560 goto error; 561 kfree(device);
562 goto error;
563 }
564 rcu_assign_pointer(device->name, name);
561 } 565 }
562 rcu_assign_pointer(device->name, name);
563 566
564 list_add(&device->dev_list, &fs_devices->devices); 567 list_add(&device->dev_list, &fs_devices->devices);
565 device->fs_devices = fs_devices; 568 device->fs_devices = fs_devices;
@@ -1680,6 +1683,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1680 if (device->bdev) 1683 if (device->bdev)
1681 device->fs_devices->open_devices--; 1684 device->fs_devices->open_devices--;
1682 1685
1686 /* remove sysfs entry */
1687 btrfs_kobj_rm_device(root->fs_info, device);
1688
1683 call_rcu(&device->rcu, free_device); 1689 call_rcu(&device->rcu, free_device);
1684 1690
1685 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1691 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2143,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2143 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2149 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
2144 btrfs_set_super_num_devices(root->fs_info->super_copy, 2150 btrfs_set_super_num_devices(root->fs_info->super_copy,
2145 total_bytes + 1); 2151 total_bytes + 1);
2152
2153 /* add sysfs device entry */
2154 btrfs_kobj_add_device(root->fs_info, device);
2155
2146 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2147 2157
2148 if (seeding_dev) { 2158 if (seeding_dev) {
2159 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2149 ret = init_first_rw_device(trans, root, device); 2160 ret = init_first_rw_device(trans, root, device);
2150 if (ret) { 2161 if (ret) {
2151 btrfs_abort_transaction(trans, root, ret); 2162 btrfs_abort_transaction(trans, root, ret);
@@ -2156,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2156 btrfs_abort_transaction(trans, root, ret); 2167 btrfs_abort_transaction(trans, root, ret);
2157 goto error_trans; 2168 goto error_trans;
2158 } 2169 }
2170
2171 /* Sprouting would change fsid of the mounted root,
2172 * so rename the fsid on the sysfs
2173 */
2174 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2175 root->fs_info->fsid);
2176 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2177 goto error_trans;
2159 } else { 2178 } else {
2160 ret = btrfs_add_device(trans, root, device); 2179 ret = btrfs_add_device(trans, root, device);
2161 if (ret) { 2180 if (ret) {
@@ -2205,6 +2224,7 @@ error_trans:
2205 unlock_chunks(root); 2224 unlock_chunks(root);
2206 btrfs_end_transaction(trans, root); 2225 btrfs_end_transaction(trans, root);
2207 rcu_string_free(device->name); 2226 rcu_string_free(device->name);
2227 btrfs_kobj_rm_device(root->fs_info, device);
2208 kfree(device); 2228 kfree(device);
2209error: 2229error:
2210 blkdev_put(bdev, FMODE_EXCL); 2230 blkdev_put(bdev, FMODE_EXCL);
@@ -2543,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
2543 remove_extent_mapping(em_tree, em); 2563 remove_extent_mapping(em_tree, em);
2544 write_unlock(&em_tree->lock); 2564 write_unlock(&em_tree->lock);
2545 2565
2546 kfree(map);
2547 em->bdev = NULL;
2548
2549 /* once for the tree */ 2566 /* once for the tree */
2550 free_extent_map(em); 2567 free_extent_map(em);
2551 /* once for us */ 2568 /* once for us */
@@ -4301,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4301 4318
4302 em = alloc_extent_map(); 4319 em = alloc_extent_map();
4303 if (!em) { 4320 if (!em) {
4321 kfree(map);
4304 ret = -ENOMEM; 4322 ret = -ENOMEM;
4305 goto error; 4323 goto error;
4306 } 4324 }
4325 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4307 em->bdev = (struct block_device *)map; 4326 em->bdev = (struct block_device *)map;
4308 em->start = start; 4327 em->start = start;
4309 em->len = num_bytes; 4328 em->len = num_bytes;
@@ -4346,7 +4365,6 @@ error_del_extent:
4346 /* One for the tree reference */ 4365 /* One for the tree reference */
4347 free_extent_map(em); 4366 free_extent_map(em);
4348error: 4367error:
4349 kfree(map);
4350 kfree(devices_info); 4368 kfree(devices_info);
4351 return ret; 4369 return ret;
4352} 4370}
@@ -4558,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4558 write_unlock(&tree->map_tree.lock); 4576 write_unlock(&tree->map_tree.lock);
4559 if (!em) 4577 if (!em)
4560 break; 4578 break;
4561 kfree(em->bdev);
4562 /* once for us */ 4579 /* once for us */
4563 free_extent_map(em); 4580 free_extent_map(em);
4564 /* once for the tree */ 4581 /* once for the tree */
@@ -5362,6 +5379,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5362 return 0; 5379 return 0;
5363} 5380}
5364 5381
5382static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
5383{
5384 if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
5385 bio_endio_nodec(bio, err);
5386 else
5387 bio_endio(bio, err);
5388 kfree(bbio);
5389}
5390
5365static void btrfs_end_bio(struct bio *bio, int err) 5391static void btrfs_end_bio(struct bio *bio, int err)
5366{ 5392{
5367 struct btrfs_bio *bbio = bio->bi_private; 5393 struct btrfs_bio *bbio = bio->bi_private;
@@ -5402,12 +5428,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5402 bio = bbio->orig_bio; 5428 bio = bbio->orig_bio;
5403 } 5429 }
5404 5430
5405 /*
5406 * We have original bio now. So increment bi_remaining to
5407 * account for it in endio
5408 */
5409 atomic_inc(&bio->bi_remaining);
5410
5411 bio->bi_private = bbio->private; 5431 bio->bi_private = bbio->private;
5412 bio->bi_end_io = bbio->end_io; 5432 bio->bi_end_io = bbio->end_io;
5413 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5433 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5424,9 +5444,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
5424 set_bit(BIO_UPTODATE, &bio->bi_flags); 5444 set_bit(BIO_UPTODATE, &bio->bi_flags);
5425 err = 0; 5445 err = 0;
5426 } 5446 }
5427 kfree(bbio);
5428 5447
5429 bio_endio(bio, err); 5448 btrfs_end_bbio(bbio, bio, err);
5430 } else if (!is_orig_bio) { 5449 } else if (!is_orig_bio) {
5431 bio_put(bio); 5450 bio_put(bio);
5432 } 5451 }
@@ -5589,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5589{ 5608{
5590 atomic_inc(&bbio->error); 5609 atomic_inc(&bbio->error);
5591 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5610 if (atomic_dec_and_test(&bbio->stripes_pending)) {
5611 /* Should be the original bio. */
5612 WARN_ON(bio != bbio->orig_bio);
5613
5592 bio->bi_private = bbio->private; 5614 bio->bi_private = bbio->private;
5593 bio->bi_end_io = bbio->end_io; 5615 bio->bi_end_io = bbio->end_io;
5594 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5616 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5595 bio->bi_iter.bi_sector = logical >> 9; 5617 bio->bi_iter.bi_sector = logical >> 9;
5596 kfree(bbio); 5618
5597 bio_endio(bio, -EIO); 5619 btrfs_end_bbio(bbio, bio, -EIO);
5598 } 5620 }
5599} 5621}
5600 5622
@@ -5681,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5681 BUG_ON(!bio); /* -ENOMEM */ 5703 BUG_ON(!bio); /* -ENOMEM */
5682 } else { 5704 } else {
5683 bio = first_bio; 5705 bio = first_bio;
5706 bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
5684 } 5707 }
5685 5708
5686 submit_stripe_bio(root, bbio, bio, 5709 submit_stripe_bio(root, bbio, bio,
@@ -5822,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5822 return -ENOMEM; 5845 return -ENOMEM;
5823 } 5846 }
5824 5847
5848 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5825 em->bdev = (struct block_device *)map; 5849 em->bdev = (struct block_device *)map;
5826 em->start = logical; 5850 em->start = logical;
5827 em->len = length; 5851 em->len = length;
@@ -5846,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5846 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 5870 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
5847 uuid, NULL); 5871 uuid, NULL);
5848 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 5872 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
5849 kfree(map);
5850 free_extent_map(em); 5873 free_extent_map(em);
5851 return -EIO; 5874 return -EIO;
5852 } 5875 }
@@ -5854,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5854 map->stripes[i].dev = 5877 map->stripes[i].dev =
5855 add_missing_dev(root, devid, uuid); 5878 add_missing_dev(root, devid, uuid);
5856 if (!map->stripes[i].dev) { 5879 if (!map->stripes[i].dev) {
5857 kfree(map);
5858 free_extent_map(em); 5880 free_extent_map(em);
5859 return -EIO; 5881 return -EIO;
5860 } 5882 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1a15bbeb65e2..2aaa00c47816 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -190,11 +190,14 @@ struct btrfs_bio_stripe {
190struct btrfs_bio; 190struct btrfs_bio;
191typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); 191typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
192 192
193#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
194
193struct btrfs_bio { 195struct btrfs_bio {
194 atomic_t stripes_pending; 196 atomic_t stripes_pending;
195 struct btrfs_fs_info *fs_info; 197 struct btrfs_fs_info *fs_info;
196 bio_end_io_t *end_io; 198 bio_end_io_t *end_io;
197 struct bio *orig_bio; 199 struct bio *orig_bio;
200 unsigned long flags;
198 void *private; 201 void *private;
199 atomic_t error; 202 atomic_t error;
200 int max_errors; 203 int max_errors;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 4f196314c0c1..b67d8fc81277 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
136 if (workspace->def_strm.total_in > 8192 && 136 if (workspace->def_strm.total_in > 8192 &&
137 workspace->def_strm.total_in < 137 workspace->def_strm.total_in <
138 workspace->def_strm.total_out) { 138 workspace->def_strm.total_out) {
139 ret = -EIO; 139 ret = -E2BIG;
140 goto out; 140 goto out;
141 } 141 }
142 /* we need another page for writing out. Test this 142 /* we need another page for writing out. Test this
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 21887d63dad5..469f2e8657e8 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -104,12 +104,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
104 umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; 104 umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
105 struct dentry *dentry; 105 struct dentry *dentry;
106 106
107 if (acl) {
108 ret = posix_acl_valid(acl);
109 if (ret < 0)
110 goto out;
111 }
112
113 switch (type) { 107 switch (type) {
114 case ACL_TYPE_ACCESS: 108 case ACL_TYPE_ACCESS:
115 name = POSIX_ACL_XATTR_ACCESS; 109 name = POSIX_ACL_XATTR_ACCESS;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 65a30e817dd8..90b3954d48ed 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -211,18 +211,15 @@ static int readpage_nounlock(struct file *filp, struct page *page)
211 SetPageError(page); 211 SetPageError(page);
212 ceph_fscache_readpage_cancel(inode, page); 212 ceph_fscache_readpage_cancel(inode, page);
213 goto out; 213 goto out;
214 } else {
215 if (err < PAGE_CACHE_SIZE) {
216 /* zero fill remainder of page */
217 zero_user_segment(page, err, PAGE_CACHE_SIZE);
218 } else {
219 flush_dcache_page(page);
220 }
221 } 214 }
222 SetPageUptodate(page); 215 if (err < PAGE_CACHE_SIZE)
216 /* zero fill remainder of page */
217 zero_user_segment(page, err, PAGE_CACHE_SIZE);
218 else
219 flush_dcache_page(page);
223 220
224 if (err >= 0) 221 SetPageUptodate(page);
225 ceph_readpage_to_fscache(inode, page); 222 ceph_readpage_to_fscache(inode, page);
226 223
227out: 224out:
228 return err < 0 ? err : 0; 225 return err < 0 ? err : 0;
@@ -1187,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1187 * never get called. 1184 * never get called.
1188 */ 1185 */
1189static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, 1186static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1190 const struct iovec *iov, 1187 struct iov_iter *iter,
1191 loff_t pos, unsigned long nr_segs) 1188 loff_t pos)
1192{ 1189{
1193 WARN_ON(1); 1190 WARN_ON(1);
1194 return -EINVAL; 1191 return -EINVAL;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c561b628ebce..1fde164b74b5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
221 return 0; 221 return 0;
222} 222}
223 223
224static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, 224struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
225 struct ceph_cap_reservation *ctx) 225 struct ceph_cap_reservation *ctx)
226{ 226{
227 struct ceph_cap *cap = NULL; 227 struct ceph_cap *cap = NULL;
228 228
@@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
508 * it is < 0. (This is so we can atomically add the cap and add an 508 * it is < 0. (This is so we can atomically add the cap and add an
509 * open file reference to it.) 509 * open file reference to it.)
510 */ 510 */
511int ceph_add_cap(struct inode *inode, 511void ceph_add_cap(struct inode *inode,
512 struct ceph_mds_session *session, u64 cap_id, 512 struct ceph_mds_session *session, u64 cap_id,
513 int fmode, unsigned issued, unsigned wanted, 513 int fmode, unsigned issued, unsigned wanted,
514 unsigned seq, unsigned mseq, u64 realmino, int flags, 514 unsigned seq, unsigned mseq, u64 realmino, int flags,
515 struct ceph_cap_reservation *caps_reservation) 515 struct ceph_cap **new_cap)
516{ 516{
517 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 517 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
518 struct ceph_inode_info *ci = ceph_inode(inode); 518 struct ceph_inode_info *ci = ceph_inode(inode);
519 struct ceph_cap *new_cap = NULL;
520 struct ceph_cap *cap; 519 struct ceph_cap *cap;
521 int mds = session->s_mds; 520 int mds = session->s_mds;
522 int actual_wanted; 521 int actual_wanted;
@@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode,
531 if (fmode >= 0) 530 if (fmode >= 0)
532 wanted |= ceph_caps_for_mode(fmode); 531 wanted |= ceph_caps_for_mode(fmode);
533 532
534retry:
535 spin_lock(&ci->i_ceph_lock);
536 cap = __get_cap_for_mds(ci, mds); 533 cap = __get_cap_for_mds(ci, mds);
537 if (!cap) { 534 if (!cap) {
538 if (new_cap) { 535 cap = *new_cap;
539 cap = new_cap; 536 *new_cap = NULL;
540 new_cap = NULL;
541 } else {
542 spin_unlock(&ci->i_ceph_lock);
543 new_cap = get_cap(mdsc, caps_reservation);
544 if (new_cap == NULL)
545 return -ENOMEM;
546 goto retry;
547 }
548 537
549 cap->issued = 0; 538 cap->issued = 0;
550 cap->implemented = 0; 539 cap->implemented = 0;
@@ -562,9 +551,6 @@ retry:
562 session->s_nr_caps++; 551 session->s_nr_caps++;
563 spin_unlock(&session->s_cap_lock); 552 spin_unlock(&session->s_cap_lock);
564 } else { 553 } else {
565 if (new_cap)
566 ceph_put_cap(mdsc, new_cap);
567
568 /* 554 /*
569 * auth mds of the inode changed. we received the cap export 555 * auth mds of the inode changed. we received the cap export
570 * message, but still haven't received the cap import message. 556 * message, but still haven't received the cap import message.
@@ -626,7 +612,6 @@ retry:
626 ci->i_auth_cap = cap; 612 ci->i_auth_cap = cap;
627 cap->mds_wanted = wanted; 613 cap->mds_wanted = wanted;
628 } 614 }
629 ci->i_cap_exporting_issued = 0;
630 } else { 615 } else {
631 WARN_ON(ci->i_auth_cap == cap); 616 WARN_ON(ci->i_auth_cap == cap);
632 } 617 }
@@ -648,9 +633,6 @@ retry:
648 633
649 if (fmode >= 0) 634 if (fmode >= 0)
650 __ceph_get_fmode(ci, fmode); 635 __ceph_get_fmode(ci, fmode);
651 spin_unlock(&ci->i_ceph_lock);
652 wake_up_all(&ci->i_cap_wq);
653 return 0;
654} 636}
655 637
656/* 638/*
@@ -685,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
685 */ 667 */
686int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) 668int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
687{ 669{
688 int have = ci->i_snap_caps | ci->i_cap_exporting_issued; 670 int have = ci->i_snap_caps;
689 struct ceph_cap *cap; 671 struct ceph_cap *cap;
690 struct rb_node *p; 672 struct rb_node *p;
691 673
@@ -900,7 +882,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
900 */ 882 */
901static int __ceph_is_any_caps(struct ceph_inode_info *ci) 883static int __ceph_is_any_caps(struct ceph_inode_info *ci)
902{ 884{
903 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; 885 return !RB_EMPTY_ROOT(&ci->i_caps);
904} 886}
905 887
906int ceph_is_any_caps(struct inode *inode) 888int ceph_is_any_caps(struct inode *inode)
@@ -2397,32 +2379,30 @@ static void invalidate_aliases(struct inode *inode)
2397 * actually be a revocation if it specifies a smaller cap set.) 2379 * actually be a revocation if it specifies a smaller cap set.)
2398 * 2380 *
2399 * caller holds s_mutex and i_ceph_lock, we drop both. 2381 * caller holds s_mutex and i_ceph_lock, we drop both.
2400 *
2401 * return value:
2402 * 0 - ok
2403 * 1 - check_caps on auth cap only (writeback)
2404 * 2 - check_caps (ack revoke)
2405 */ 2382 */
2406static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, 2383static void handle_cap_grant(struct ceph_mds_client *mdsc,
2384 struct inode *inode, struct ceph_mds_caps *grant,
2385 void *snaptrace, int snaptrace_len,
2386 struct ceph_buffer *xattr_buf,
2407 struct ceph_mds_session *session, 2387 struct ceph_mds_session *session,
2408 struct ceph_cap *cap, 2388 struct ceph_cap *cap, int issued)
2409 struct ceph_buffer *xattr_buf) 2389 __releases(ci->i_ceph_lock)
2410 __releases(ci->i_ceph_lock)
2411{ 2390{
2412 struct ceph_inode_info *ci = ceph_inode(inode); 2391 struct ceph_inode_info *ci = ceph_inode(inode);
2413 int mds = session->s_mds; 2392 int mds = session->s_mds;
2414 int seq = le32_to_cpu(grant->seq); 2393 int seq = le32_to_cpu(grant->seq);
2415 int newcaps = le32_to_cpu(grant->caps); 2394 int newcaps = le32_to_cpu(grant->caps);
2416 int issued, implemented, used, wanted, dirty; 2395 int used, wanted, dirty;
2417 u64 size = le64_to_cpu(grant->size); 2396 u64 size = le64_to_cpu(grant->size);
2418 u64 max_size = le64_to_cpu(grant->max_size); 2397 u64 max_size = le64_to_cpu(grant->max_size);
2419 struct timespec mtime, atime, ctime; 2398 struct timespec mtime, atime, ctime;
2420 int check_caps = 0; 2399 int check_caps = 0;
2421 int wake = 0; 2400 bool wake = 0;
2422 int writeback = 0; 2401 bool writeback = 0;
2423 int queue_invalidate = 0; 2402 bool queue_trunc = 0;
2424 int deleted_inode = 0; 2403 bool queue_invalidate = 0;
2425 int queue_revalidate = 0; 2404 bool queue_revalidate = 0;
2405 bool deleted_inode = 0;
2426 2406
2427 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2407 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2428 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2408 inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2466,16 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2466 } 2446 }
2467 2447
2468 /* side effects now are allowed */ 2448 /* side effects now are allowed */
2469
2470 issued = __ceph_caps_issued(ci, &implemented);
2471 issued |= implemented | __ceph_caps_dirty(ci);
2472
2473 cap->cap_gen = session->s_cap_gen; 2449 cap->cap_gen = session->s_cap_gen;
2474 cap->seq = seq; 2450 cap->seq = seq;
2475 2451
2476 __check_cap_issue(ci, cap, newcaps); 2452 __check_cap_issue(ci, cap, newcaps);
2477 2453
2478 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 2454 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
2455 (issued & CEPH_CAP_AUTH_EXCL) == 0) {
2479 inode->i_mode = le32_to_cpu(grant->mode); 2456 inode->i_mode = le32_to_cpu(grant->mode);
2480 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); 2457 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
2481 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); 2458 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
@@ -2484,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2484 from_kgid(&init_user_ns, inode->i_gid)); 2461 from_kgid(&init_user_ns, inode->i_gid));
2485 } 2462 }
2486 2463
2487 if ((issued & CEPH_CAP_LINK_EXCL) == 0) { 2464 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
2465 (issued & CEPH_CAP_LINK_EXCL) == 0) {
2488 set_nlink(inode, le32_to_cpu(grant->nlink)); 2466 set_nlink(inode, le32_to_cpu(grant->nlink));
2489 if (inode->i_nlink == 0 && 2467 if (inode->i_nlink == 0 &&
2490 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) 2468 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
@@ -2511,30 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2511 if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) 2489 if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
2512 queue_revalidate = 1; 2490 queue_revalidate = 1;
2513 2491
2514 /* size/ctime/mtime/atime? */ 2492 if (newcaps & CEPH_CAP_ANY_RD) {
2515 ceph_fill_file_size(inode, issued, 2493 /* ctime/mtime/atime? */
2516 le32_to_cpu(grant->truncate_seq), 2494 ceph_decode_timespec(&mtime, &grant->mtime);
2517 le64_to_cpu(grant->truncate_size), size); 2495 ceph_decode_timespec(&atime, &grant->atime);
2518 ceph_decode_timespec(&mtime, &grant->mtime); 2496 ceph_decode_timespec(&ctime, &grant->ctime);
2519 ceph_decode_timespec(&atime, &grant->atime); 2497 ceph_fill_file_time(inode, issued,
2520 ceph_decode_timespec(&ctime, &grant->ctime); 2498 le32_to_cpu(grant->time_warp_seq),
2521 ceph_fill_file_time(inode, issued, 2499 &ctime, &mtime, &atime);
2522 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2500 }
2523 &atime); 2501
2524 2502 if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
2525 2503 /* file layout may have changed */
2526 /* file layout may have changed */ 2504 ci->i_layout = grant->layout;
2527 ci->i_layout = grant->layout; 2505 /* size/truncate_seq? */
2528 2506 queue_trunc = ceph_fill_file_size(inode, issued,
2529 /* max size increase? */ 2507 le32_to_cpu(grant->truncate_seq),
2530 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 2508 le64_to_cpu(grant->truncate_size),
2531 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2509 size);
2532 ci->i_max_size = max_size; 2510 /* max size increase? */
2533 if (max_size >= ci->i_wanted_max_size) { 2511 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
2534 ci->i_wanted_max_size = 0; /* reset */ 2512 dout("max_size %lld -> %llu\n",
2535 ci->i_requested_max_size = 0; 2513 ci->i_max_size, max_size);
2514 ci->i_max_size = max_size;
2515 if (max_size >= ci->i_wanted_max_size) {
2516 ci->i_wanted_max_size = 0; /* reset */
2517 ci->i_requested_max_size = 0;
2518 }
2519 wake = 1;
2536 } 2520 }
2537 wake = 1;
2538 } 2521 }
2539 2522
2540 /* check cap bits */ 2523 /* check cap bits */
@@ -2595,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2595 2578
2596 spin_unlock(&ci->i_ceph_lock); 2579 spin_unlock(&ci->i_ceph_lock);
2597 2580
2581 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
2582 down_write(&mdsc->snap_rwsem);
2583 ceph_update_snap_trace(mdsc, snaptrace,
2584 snaptrace + snaptrace_len, false);
2585 downgrade_write(&mdsc->snap_rwsem);
2586 kick_flushing_inode_caps(mdsc, session, inode);
2587 up_read(&mdsc->snap_rwsem);
2588 if (newcaps & ~issued)
2589 wake = 1;
2590 }
2591
2592 if (queue_trunc) {
2593 ceph_queue_vmtruncate(inode);
2594 ceph_queue_revalidate(inode);
2595 } else if (queue_revalidate)
2596 ceph_queue_revalidate(inode);
2597
2598 if (writeback) 2598 if (writeback)
2599 /* 2599 /*
2600 * queue inode for writeback: we can't actually call 2600 * queue inode for writeback: we can't actually call
@@ -2606,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2606 ceph_queue_invalidate(inode); 2606 ceph_queue_invalidate(inode);
2607 if (deleted_inode) 2607 if (deleted_inode)
2608 invalidate_aliases(inode); 2608 invalidate_aliases(inode);
2609 if (queue_revalidate)
2610 ceph_queue_revalidate(inode);
2611 if (wake) 2609 if (wake)
2612 wake_up_all(&ci->i_cap_wq); 2610 wake_up_all(&ci->i_cap_wq);
2613 2611
@@ -2784,7 +2782,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2784{ 2782{
2785 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2783 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2786 struct ceph_mds_session *tsession = NULL; 2784 struct ceph_mds_session *tsession = NULL;
2787 struct ceph_cap *cap, *tcap; 2785 struct ceph_cap *cap, *tcap, *new_cap = NULL;
2788 struct ceph_inode_info *ci = ceph_inode(inode); 2786 struct ceph_inode_info *ci = ceph_inode(inode);
2789 u64 t_cap_id; 2787 u64 t_cap_id;
2790 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2788 unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2807,7 +2805,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2807retry: 2805retry:
2808 spin_lock(&ci->i_ceph_lock); 2806 spin_lock(&ci->i_ceph_lock);
2809 cap = __get_cap_for_mds(ci, mds); 2807 cap = __get_cap_for_mds(ci, mds);
2810 if (!cap) 2808 if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
2811 goto out_unlock; 2809 goto out_unlock;
2812 2810
2813 if (target < 0) { 2811 if (target < 0) {
@@ -2846,15 +2844,14 @@ retry:
2846 } 2844 }
2847 __ceph_remove_cap(cap, false); 2845 __ceph_remove_cap(cap, false);
2848 goto out_unlock; 2846 goto out_unlock;
2849 } 2847 } else if (tsession) {
2850
2851 if (tsession) {
2852 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
2853 spin_unlock(&ci->i_ceph_lock);
2854 /* add placeholder for the export target */ 2848 /* add placeholder for the export target */
2849 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
2855 ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, 2850 ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
2856 t_seq - 1, t_mseq, (u64)-1, flag, NULL); 2851 t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
2857 goto retry; 2852
2853 __ceph_remove_cap(cap, false);
2854 goto out_unlock;
2858 } 2855 }
2859 2856
2860 spin_unlock(&ci->i_ceph_lock); 2857 spin_unlock(&ci->i_ceph_lock);
@@ -2873,6 +2870,7 @@ retry:
2873 SINGLE_DEPTH_NESTING); 2870 SINGLE_DEPTH_NESTING);
2874 } 2871 }
2875 ceph_add_cap_releases(mdsc, tsession); 2872 ceph_add_cap_releases(mdsc, tsession);
2873 new_cap = ceph_get_cap(mdsc, NULL);
2876 } else { 2874 } else {
2877 WARN_ON(1); 2875 WARN_ON(1);
2878 tsession = NULL; 2876 tsession = NULL;
@@ -2887,24 +2885,27 @@ out_unlock:
2887 mutex_unlock(&tsession->s_mutex); 2885 mutex_unlock(&tsession->s_mutex);
2888 ceph_put_mds_session(tsession); 2886 ceph_put_mds_session(tsession);
2889 } 2887 }
2888 if (new_cap)
2889 ceph_put_cap(mdsc, new_cap);
2890} 2890}
2891 2891
2892/* 2892/*
2893 * Handle cap IMPORT. If there are temp bits from an older EXPORT, 2893 * Handle cap IMPORT.
2894 * clean them up.
2895 * 2894 *
2896 * caller holds s_mutex. 2895 * caller holds s_mutex. acquires i_ceph_lock
2897 */ 2896 */
2898static void handle_cap_import(struct ceph_mds_client *mdsc, 2897static void handle_cap_import(struct ceph_mds_client *mdsc,
2899 struct inode *inode, struct ceph_mds_caps *im, 2898 struct inode *inode, struct ceph_mds_caps *im,
2900 struct ceph_mds_cap_peer *ph, 2899 struct ceph_mds_cap_peer *ph,
2901 struct ceph_mds_session *session, 2900 struct ceph_mds_session *session,
2902 void *snaptrace, int snaptrace_len) 2901 struct ceph_cap **target_cap, int *old_issued)
2902 __acquires(ci->i_ceph_lock)
2903{ 2903{
2904 struct ceph_inode_info *ci = ceph_inode(inode); 2904 struct ceph_inode_info *ci = ceph_inode(inode);
2905 struct ceph_cap *cap; 2905 struct ceph_cap *cap, *ocap, *new_cap = NULL;
2906 int mds = session->s_mds; 2906 int mds = session->s_mds;
2907 unsigned issued = le32_to_cpu(im->caps); 2907 int issued;
2908 unsigned caps = le32_to_cpu(im->caps);
2908 unsigned wanted = le32_to_cpu(im->wanted); 2909 unsigned wanted = le32_to_cpu(im->wanted);
2909 unsigned seq = le32_to_cpu(im->seq); 2910 unsigned seq = le32_to_cpu(im->seq);
2910 unsigned mseq = le32_to_cpu(im->migrate_seq); 2911 unsigned mseq = le32_to_cpu(im->migrate_seq);
@@ -2924,40 +2925,52 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2924 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", 2925 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
2925 inode, ci, mds, mseq, peer); 2926 inode, ci, mds, mseq, peer);
2926 2927
2928retry:
2927 spin_lock(&ci->i_ceph_lock); 2929 spin_lock(&ci->i_ceph_lock);
2928 cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; 2930 cap = __get_cap_for_mds(ci, mds);
2929 if (cap && cap->cap_id == p_cap_id) { 2931 if (!cap) {
2932 if (!new_cap) {
2933 spin_unlock(&ci->i_ceph_lock);
2934 new_cap = ceph_get_cap(mdsc, NULL);
2935 goto retry;
2936 }
2937 cap = new_cap;
2938 } else {
2939 if (new_cap) {
2940 ceph_put_cap(mdsc, new_cap);
2941 new_cap = NULL;
2942 }
2943 }
2944
2945 __ceph_caps_issued(ci, &issued);
2946 issued |= __ceph_caps_dirty(ci);
2947
2948 ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
2949 realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
2950
2951 ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
2952 if (ocap && ocap->cap_id == p_cap_id) {
2930 dout(" remove export cap %p mds%d flags %d\n", 2953 dout(" remove export cap %p mds%d flags %d\n",
2931 cap, peer, ph->flags); 2954 ocap, peer, ph->flags);
2932 if ((ph->flags & CEPH_CAP_FLAG_AUTH) && 2955 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
2933 (cap->seq != le32_to_cpu(ph->seq) || 2956 (ocap->seq != le32_to_cpu(ph->seq) ||
2934 cap->mseq != le32_to_cpu(ph->mseq))) { 2957 ocap->mseq != le32_to_cpu(ph->mseq))) {
2935 pr_err("handle_cap_import: mismatched seq/mseq: " 2958 pr_err("handle_cap_import: mismatched seq/mseq: "
2936 "ino (%llx.%llx) mds%d seq %d mseq %d " 2959 "ino (%llx.%llx) mds%d seq %d mseq %d "
2937 "importer mds%d has peer seq %d mseq %d\n", 2960 "importer mds%d has peer seq %d mseq %d\n",
2938 ceph_vinop(inode), peer, cap->seq, 2961 ceph_vinop(inode), peer, ocap->seq,
2939 cap->mseq, mds, le32_to_cpu(ph->seq), 2962 ocap->mseq, mds, le32_to_cpu(ph->seq),
2940 le32_to_cpu(ph->mseq)); 2963 le32_to_cpu(ph->mseq));
2941 } 2964 }
2942 ci->i_cap_exporting_issued = cap->issued; 2965 __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
2943 __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
2944 } 2966 }
2945 2967
2946 /* make sure we re-request max_size, if necessary */ 2968 /* make sure we re-request max_size, if necessary */
2947 ci->i_wanted_max_size = 0; 2969 ci->i_wanted_max_size = 0;
2948 ci->i_requested_max_size = 0; 2970 ci->i_requested_max_size = 0;
2949 spin_unlock(&ci->i_ceph_lock);
2950
2951 down_write(&mdsc->snap_rwsem);
2952 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2953 false);
2954 downgrade_write(&mdsc->snap_rwsem);
2955 ceph_add_cap(inode, session, cap_id, -1,
2956 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2957 NULL /* no caps context */);
2958 kick_flushing_inode_caps(mdsc, session, inode);
2959 up_read(&mdsc->snap_rwsem);
2960 2971
2972 *old_issued = issued;
2973 *target_cap = cap;
2961} 2974}
2962 2975
2963/* 2976/*
@@ -2977,7 +2990,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2977 struct ceph_mds_caps *h; 2990 struct ceph_mds_caps *h;
2978 struct ceph_mds_cap_peer *peer = NULL; 2991 struct ceph_mds_cap_peer *peer = NULL;
2979 int mds = session->s_mds; 2992 int mds = session->s_mds;
2980 int op; 2993 int op, issued;
2981 u32 seq, mseq; 2994 u32 seq, mseq;
2982 struct ceph_vino vino; 2995 struct ceph_vino vino;
2983 u64 cap_id; 2996 u64 cap_id;
@@ -3069,7 +3082,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3069 3082
3070 case CEPH_CAP_OP_IMPORT: 3083 case CEPH_CAP_OP_IMPORT:
3071 handle_cap_import(mdsc, inode, h, peer, session, 3084 handle_cap_import(mdsc, inode, h, peer, session,
3072 snaptrace, snaptrace_len); 3085 &cap, &issued);
3086 handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
3087 msg->middle, session, cap, issued);
3088 goto done_unlocked;
3073 } 3089 }
3074 3090
3075 /* the rest require a cap */ 3091 /* the rest require a cap */
@@ -3086,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3086 switch (op) { 3102 switch (op) {
3087 case CEPH_CAP_OP_REVOKE: 3103 case CEPH_CAP_OP_REVOKE:
3088 case CEPH_CAP_OP_GRANT: 3104 case CEPH_CAP_OP_GRANT:
3089 case CEPH_CAP_OP_IMPORT: 3105 __ceph_caps_issued(ci, &issued);
3090 handle_cap_grant(inode, h, session, cap, msg->middle); 3106 issued |= __ceph_caps_dirty(ci);
3107 handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
3108 session, cap, issued);
3091 goto done_unlocked; 3109 goto done_unlocked;
3092 3110
3093 case CEPH_CAP_OP_FLUSH_ACK: 3111 case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 00d6af6a32ec..8d7d782f4382 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -169,7 +169,7 @@ static struct dentry *__get_parent(struct super_block *sb,
169 return dentry; 169 return dentry;
170} 170}
171 171
172struct dentry *ceph_get_parent(struct dentry *child) 172static struct dentry *ceph_get_parent(struct dentry *child)
173{ 173{
174 /* don't re-export snaps */ 174 /* don't re-export snaps */
175 if (ceph_snap(child->d_inode) != CEPH_NOSNAP) 175 if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 88a6df4cbe6d..302085100c28 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -418,7 +418,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
418 struct page **pages; 418 struct page **pages;
419 u64 off = iocb->ki_pos; 419 u64 off = iocb->ki_pos;
420 int num_pages, ret; 420 int num_pages, ret;
421 size_t len = i->count; 421 size_t len = iov_iter_count(i);
422 422
423 dout("sync_read on file %p %llu~%u %s\n", file, off, 423 dout("sync_read on file %p %llu~%u %s\n", file, off,
424 (unsigned)len, 424 (unsigned)len,
@@ -436,25 +436,26 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
436 436
437 if (file->f_flags & O_DIRECT) { 437 if (file->f_flags & O_DIRECT) {
438 while (iov_iter_count(i)) { 438 while (iov_iter_count(i)) {
439 void __user *data = i->iov[0].iov_base + i->iov_offset; 439 size_t start;
440 size_t len = i->iov[0].iov_len - i->iov_offset; 440 ssize_t n;
441 441
442 num_pages = calc_pages_for((unsigned long)data, len); 442 n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
443 pages = ceph_get_direct_page_vector(data, 443 if (n < 0)
444 num_pages, true); 444 return n;
445 if (IS_ERR(pages))
446 return PTR_ERR(pages);
447 445
448 ret = striped_read(inode, off, len, 446 num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
447
448 ret = striped_read(inode, off, n,
449 pages, num_pages, checkeof, 449 pages, num_pages, checkeof,
450 1, (unsigned long)data & ~PAGE_MASK); 450 1, start);
451
451 ceph_put_page_vector(pages, num_pages, true); 452 ceph_put_page_vector(pages, num_pages, true);
452 453
453 if (ret <= 0) 454 if (ret <= 0)
454 break; 455 break;
455 off += ret; 456 off += ret;
456 iov_iter_advance(i, ret); 457 iov_iter_advance(i, ret);
457 if (ret < len) 458 if (ret < n)
458 break; 459 break;
459 } 460 }
460 } else { 461 } else {
@@ -466,25 +467,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
466 num_pages, checkeof, 0, 0); 467 num_pages, checkeof, 0, 0);
467 if (ret > 0) { 468 if (ret > 0) {
468 int l, k = 0; 469 int l, k = 0;
469 size_t left = len = ret; 470 size_t left = ret;
470 471
471 while (left) { 472 while (left) {
472 void __user *data = i->iov[0].iov_base 473 int copy = min_t(size_t, PAGE_SIZE, left);
473 + i->iov_offset; 474 l = copy_page_to_iter(pages[k++], 0, copy, i);
474 l = min(i->iov[0].iov_len - i->iov_offset, 475 off += l;
475 left); 476 left -= l;
476 477 if (l < copy)
477 ret = ceph_copy_page_vector_to_user(&pages[k],
478 data, off,
479 l);
480 if (ret > 0) {
481 iov_iter_advance(i, ret);
482 left -= ret;
483 off += ret;
484 k = calc_pages_for(iocb->ki_pos,
485 len - left + 1) - 1;
486 BUG_ON(k >= num_pages && left);
487 } else
488 break; 478 break;
489 } 479 }
490 } 480 }
@@ -541,8 +531,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
541 * objects, rollback on failure, etc.) 531 * objects, rollback on failure, etc.)
542 */ 532 */
543static ssize_t 533static ssize_t
544ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, 534ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
545 unsigned long nr_segs, size_t count)
546{ 535{
547 struct file *file = iocb->ki_filp; 536 struct file *file = iocb->ki_filp;
548 struct inode *inode = file_inode(file); 537 struct inode *inode = file_inode(file);
@@ -556,11 +545,10 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
556 int written = 0; 545 int written = 0;
557 int flags; 546 int flags;
558 int check_caps = 0; 547 int check_caps = 0;
559 int page_align;
560 int ret; 548 int ret;
561 struct timespec mtime = CURRENT_TIME; 549 struct timespec mtime = CURRENT_TIME;
562 loff_t pos = iocb->ki_pos; 550 loff_t pos = iocb->ki_pos;
563 struct iov_iter i; 551 size_t count = iov_iter_count(from);
564 552
565 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 553 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
566 return -EROFS; 554 return -EROFS;
@@ -582,13 +570,10 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
582 CEPH_OSD_FLAG_ONDISK | 570 CEPH_OSD_FLAG_ONDISK |
583 CEPH_OSD_FLAG_WRITE; 571 CEPH_OSD_FLAG_WRITE;
584 572
585 iov_iter_init(&i, iov, nr_segs, count, 0); 573 while (iov_iter_count(from) > 0) {
586 574 u64 len = iov_iter_single_seg_count(from);
587 while (iov_iter_count(&i) > 0) { 575 size_t start;
588 void __user *data = i.iov->iov_base + i.iov_offset; 576 ssize_t n;
589 u64 len = i.iov->iov_len - i.iov_offset;
590
591 page_align = (unsigned long)data & ~PAGE_MASK;
592 577
593 snapc = ci->i_snap_realm->cached_context; 578 snapc = ci->i_snap_realm->cached_context;
594 vino = ceph_vino(inode); 579 vino = ceph_vino(inode);
@@ -604,20 +589,21 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
604 break; 589 break;
605 } 590 }
606 591
607 num_pages = calc_pages_for(page_align, len); 592 n = iov_iter_get_pages_alloc(from, &pages, len, &start);
608 pages = ceph_get_direct_page_vector(data, num_pages, false); 593 if (unlikely(n < 0)) {
609 if (IS_ERR(pages)) { 594 ret = n;
610 ret = PTR_ERR(pages); 595 ceph_osdc_put_request(req);
611 goto out; 596 break;
612 } 597 }
613 598
599 num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
614 /* 600 /*
615 * throw out any page cache pages in this range. this 601 * throw out any page cache pages in this range. this
616 * may block. 602 * may block.
617 */ 603 */
618 truncate_inode_pages_range(inode->i_mapping, pos, 604 truncate_inode_pages_range(inode->i_mapping, pos,
619 (pos+len) | (PAGE_CACHE_SIZE-1)); 605 (pos+n) | (PAGE_CACHE_SIZE-1));
620 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, 606 osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
621 false, false); 607 false, false);
622 608
623 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 609 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
@@ -629,22 +615,20 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
629 615
630 ceph_put_page_vector(pages, num_pages, false); 616 ceph_put_page_vector(pages, num_pages, false);
631 617
632out:
633 ceph_osdc_put_request(req); 618 ceph_osdc_put_request(req);
634 if (ret == 0) { 619 if (ret)
635 pos += len;
636 written += len;
637 iov_iter_advance(&i, (size_t)len);
638
639 if (pos > i_size_read(inode)) {
640 check_caps = ceph_inode_set_size(inode, pos);
641 if (check_caps)
642 ceph_check_caps(ceph_inode(inode),
643 CHECK_CAPS_AUTHONLY,
644 NULL);
645 }
646 } else
647 break; 620 break;
621 pos += n;
622 written += n;
623 iov_iter_advance(from, n);
624
625 if (pos > i_size_read(inode)) {
626 check_caps = ceph_inode_set_size(inode, pos);
627 if (check_caps)
628 ceph_check_caps(ceph_inode(inode),
629 CHECK_CAPS_AUTHONLY,
630 NULL);
631 }
648 } 632 }
649 633
650 if (ret != -EOLDSNAPC && written > 0) { 634 if (ret != -EOLDSNAPC && written > 0) {
@@ -662,8 +646,7 @@ out:
662 * correct atomic write, we should e.g. take write locks on all 646 * correct atomic write, we should e.g. take write locks on all
663 * objects, rollback on failure, etc.) 647 * objects, rollback on failure, etc.)
664 */ 648 */
665static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, 649static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
666 unsigned long nr_segs, size_t count)
667{ 650{
668 struct file *file = iocb->ki_filp; 651 struct file *file = iocb->ki_filp;
669 struct inode *inode = file_inode(file); 652 struct inode *inode = file_inode(file);
@@ -681,7 +664,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
681 int ret; 664 int ret;
682 struct timespec mtime = CURRENT_TIME; 665 struct timespec mtime = CURRENT_TIME;
683 loff_t pos = iocb->ki_pos; 666 loff_t pos = iocb->ki_pos;
684 struct iov_iter i; 667 size_t count = iov_iter_count(from);
685 668
686 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 669 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
687 return -EROFS; 670 return -EROFS;
@@ -703,9 +686,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
703 CEPH_OSD_FLAG_WRITE | 686 CEPH_OSD_FLAG_WRITE |
704 CEPH_OSD_FLAG_ACK; 687 CEPH_OSD_FLAG_ACK;
705 688
706 iov_iter_init(&i, iov, nr_segs, count, 0); 689 while ((len = iov_iter_count(from)) > 0) {
707
708 while ((len = iov_iter_count(&i)) > 0) {
709 size_t left; 690 size_t left;
710 int n; 691 int n;
711 692
@@ -737,13 +718,12 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
737 left = len; 718 left = len;
738 for (n = 0; n < num_pages; n++) { 719 for (n = 0; n < num_pages; n++) {
739 size_t plen = min_t(size_t, left, PAGE_SIZE); 720 size_t plen = min_t(size_t, left, PAGE_SIZE);
740 ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); 721 ret = copy_page_from_iter(pages[n], 0, plen, from);
741 if (ret != plen) { 722 if (ret != plen) {
742 ret = -EFAULT; 723 ret = -EFAULT;
743 break; 724 break;
744 } 725 }
745 left -= ret; 726 left -= ret;
746 iov_iter_advance(&i, ret);
747 } 727 }
748 728
749 if (ret < 0) { 729 if (ret < 0) {
@@ -796,8 +776,7 @@ out:
796 * 776 *
797 * Hmm, the sync read case isn't actually async... should it be? 777 * Hmm, the sync read case isn't actually async... should it be?
798 */ 778 */
799static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, 779static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
800 unsigned long nr_segs, loff_t pos)
801{ 780{
802 struct file *filp = iocb->ki_filp; 781 struct file *filp = iocb->ki_filp;
803 struct ceph_file_info *fi = filp->private_data; 782 struct ceph_file_info *fi = filp->private_data;
@@ -823,40 +802,20 @@ again:
823 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 802 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
824 (iocb->ki_filp->f_flags & O_DIRECT) || 803 (iocb->ki_filp->f_flags & O_DIRECT) ||
825 (fi->flags & CEPH_F_SYNC)) { 804 (fi->flags & CEPH_F_SYNC)) {
826 struct iov_iter i;
827 805
828 dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", 806 dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
829 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 807 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
830 ceph_cap_string(got)); 808 ceph_cap_string(got));
831 809
832 if (!read) {
833 ret = generic_segment_checks(iov, &nr_segs,
834 &len, VERIFY_WRITE);
835 if (ret)
836 goto out;
837 }
838
839 iov_iter_init(&i, iov, nr_segs, len, read);
840
841 /* hmm, this isn't really async... */ 810 /* hmm, this isn't really async... */
842 ret = ceph_sync_read(iocb, &i, &checkeof); 811 ret = ceph_sync_read(iocb, to, &checkeof);
843 } else { 812 } else {
844 /*
845 * We can't modify the content of iov,
846 * so we only read from beginning.
847 */
848 if (read) {
849 iocb->ki_pos = pos;
850 len = iocb->ki_nbytes;
851 read = 0;
852 }
853 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 813 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
854 inode, ceph_vinop(inode), pos, (unsigned)len, 814 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
855 ceph_cap_string(got)); 815 ceph_cap_string(got));
856 816
857 ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 817 ret = generic_file_read_iter(iocb, to);
858 } 818 }
859out:
860 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 819 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
861 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 820 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
862 ceph_put_cap_refs(ci, got); 821 ceph_put_cap_refs(ci, got);
@@ -872,6 +831,7 @@ out:
872 ", reading more\n", iocb->ki_pos, 831 ", reading more\n", iocb->ki_pos,
873 inode->i_size); 832 inode->i_size);
874 833
834 iov_iter_advance(to, ret);
875 read += ret; 835 read += ret;
876 len -= ret; 836 len -= ret;
877 checkeof = 0; 837 checkeof = 0;
@@ -895,8 +855,7 @@ out:
895 * 855 *
896 * If we are near ENOSPC, write synchronously. 856 * If we are near ENOSPC, write synchronously.
897 */ 857 */
898static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, 858static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
899 unsigned long nr_segs, loff_t pos)
900{ 859{
901 struct file *file = iocb->ki_filp; 860 struct file *file = iocb->ki_filp;
902 struct ceph_file_info *fi = file->private_data; 861 struct ceph_file_info *fi = file->private_data;
@@ -904,18 +863,15 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
904 struct ceph_inode_info *ci = ceph_inode(inode); 863 struct ceph_inode_info *ci = ceph_inode(inode);
905 struct ceph_osd_client *osdc = 864 struct ceph_osd_client *osdc =
906 &ceph_sb_to_client(inode->i_sb)->client->osdc; 865 &ceph_sb_to_client(inode->i_sb)->client->osdc;
907 ssize_t count, written = 0; 866 ssize_t count = iov_iter_count(from), written = 0;
908 int err, want, got; 867 int err, want, got;
868 loff_t pos = iocb->ki_pos;
909 869
910 if (ceph_snap(inode) != CEPH_NOSNAP) 870 if (ceph_snap(inode) != CEPH_NOSNAP)
911 return -EROFS; 871 return -EROFS;
912 872
913 mutex_lock(&inode->i_mutex); 873 mutex_lock(&inode->i_mutex);
914 874
915 err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
916 if (err)
917 goto out;
918
919 /* We can write back this queue in page reclaim */ 875 /* We can write back this queue in page reclaim */
920 current->backing_dev_info = file->f_mapping->backing_dev_info; 876 current->backing_dev_info = file->f_mapping->backing_dev_info;
921 877
@@ -925,6 +881,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
925 881
926 if (count == 0) 882 if (count == 0)
927 goto out; 883 goto out;
884 iov_iter_truncate(from, count);
928 885
929 err = file_remove_suid(file); 886 err = file_remove_suid(file);
930 if (err) 887 if (err)
@@ -956,23 +913,26 @@ retry_snap:
956 913
957 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 914 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
958 (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { 915 (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
916 struct iov_iter data;
959 mutex_unlock(&inode->i_mutex); 917 mutex_unlock(&inode->i_mutex);
918 /* we might need to revert back to that point */
919 data = *from;
960 if (file->f_flags & O_DIRECT) 920 if (file->f_flags & O_DIRECT)
961 written = ceph_sync_direct_write(iocb, iov, 921 written = ceph_sync_direct_write(iocb, &data);
962 nr_segs, count);
963 else 922 else
964 written = ceph_sync_write(iocb, iov, nr_segs, count); 923 written = ceph_sync_write(iocb, &data);
965 if (written == -EOLDSNAPC) { 924 if (written == -EOLDSNAPC) {
966 dout("aio_write %p %llx.%llx %llu~%u" 925 dout("aio_write %p %llx.%llx %llu~%u"
967 "got EOLDSNAPC, retrying\n", 926 "got EOLDSNAPC, retrying\n",
968 inode, ceph_vinop(inode), 927 inode, ceph_vinop(inode),
969 pos, (unsigned)iov->iov_len); 928 pos, (unsigned)count);
970 mutex_lock(&inode->i_mutex); 929 mutex_lock(&inode->i_mutex);
971 goto retry_snap; 930 goto retry_snap;
972 } 931 }
932 if (written > 0)
933 iov_iter_advance(from, written);
973 } else { 934 } else {
974 loff_t old_size = inode->i_size; 935 loff_t old_size = inode->i_size;
975 struct iov_iter from;
976 /* 936 /*
977 * No need to acquire the i_truncate_mutex. Because 937 * No need to acquire the i_truncate_mutex. Because
978 * the MDS revokes Fwb caps before sending truncate 938 * the MDS revokes Fwb caps before sending truncate
@@ -980,8 +940,7 @@ retry_snap:
980 * are pending vmtruncate. So write and vmtruncate 940 * are pending vmtruncate. So write and vmtruncate
981 * can not run at the same time 941 * can not run at the same time
982 */ 942 */
983 iov_iter_init(&from, iov, nr_segs, count, 0); 943 written = generic_perform_write(file, from, pos);
984 written = generic_perform_write(file, &from, pos);
985 if (likely(written >= 0)) 944 if (likely(written >= 0))
986 iocb->ki_pos = pos + written; 945 iocb->ki_pos = pos + written;
987 if (inode->i_size > old_size) 946 if (inode->i_size > old_size)
@@ -999,7 +958,7 @@ retry_snap:
999 } 958 }
1000 959
1001 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 960 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
1002 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 961 inode, ceph_vinop(inode), pos, (unsigned)count,
1003 ceph_cap_string(got)); 962 ceph_cap_string(got));
1004 ceph_put_cap_refs(ci, got); 963 ceph_put_cap_refs(ci, got);
1005 964
@@ -1276,16 +1235,16 @@ const struct file_operations ceph_file_fops = {
1276 .open = ceph_open, 1235 .open = ceph_open,
1277 .release = ceph_release, 1236 .release = ceph_release,
1278 .llseek = ceph_llseek, 1237 .llseek = ceph_llseek,
1279 .read = do_sync_read, 1238 .read = new_sync_read,
1280 .write = do_sync_write, 1239 .write = new_sync_write,
1281 .aio_read = ceph_aio_read, 1240 .read_iter = ceph_read_iter,
1282 .aio_write = ceph_aio_write, 1241 .write_iter = ceph_write_iter,
1283 .mmap = ceph_mmap, 1242 .mmap = ceph_mmap,
1284 .fsync = ceph_fsync, 1243 .fsync = ceph_fsync,
1285 .lock = ceph_lock, 1244 .lock = ceph_lock,
1286 .flock = ceph_flock, 1245 .flock = ceph_flock,
1287 .splice_read = generic_file_splice_read, 1246 .splice_read = generic_file_splice_read,
1288 .splice_write = generic_file_splice_write, 1247 .splice_write = iter_file_splice_write,
1289 .unlocked_ioctl = ceph_ioctl, 1248 .unlocked_ioctl = ceph_ioctl,
1290 .compat_ioctl = ceph_ioctl, 1249 .compat_ioctl = ceph_ioctl,
1291 .fallocate = ceph_fallocate, 1250 .fallocate = ceph_fallocate,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e4fff9ff1c27..04c89c266cec 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -10,6 +10,7 @@
10#include <linux/writeback.h> 10#include <linux/writeback.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h>
13 14
14#include "super.h" 15#include "super.h"
15#include "mds_client.h" 16#include "mds_client.h"
@@ -179,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
179 * specified, copy the frag delegation info to the caller if 180 * specified, copy the frag delegation info to the caller if
180 * it is present. 181 * it is present.
181 */ 182 */
182u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 183static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
183 struct ceph_inode_frag *pfrag, 184 struct ceph_inode_frag *pfrag, int *found)
184 int *found)
185{ 185{
186 u32 t = ceph_frag_make(0, 0); 186 u32 t = ceph_frag_make(0, 0);
187 struct ceph_inode_frag *frag; 187 struct ceph_inode_frag *frag;
@@ -191,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
191 if (found) 191 if (found)
192 *found = 0; 192 *found = 0;
193 193
194 mutex_lock(&ci->i_fragtree_mutex);
195 while (1) { 194 while (1) {
196 WARN_ON(!ceph_frag_contains_value(t, v)); 195 WARN_ON(!ceph_frag_contains_value(t, v));
197 frag = __ceph_find_frag(ci, t); 196 frag = __ceph_find_frag(ci, t);
@@ -220,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
220 } 219 }
221 dout("choose_frag(%x) = %x\n", v, t); 220 dout("choose_frag(%x) = %x\n", v, t);
222 221
223 mutex_unlock(&ci->i_fragtree_mutex);
224 return t; 222 return t;
225} 223}
226 224
225u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
226 struct ceph_inode_frag *pfrag, int *found)
227{
228 u32 ret;
229 mutex_lock(&ci->i_fragtree_mutex);
230 ret = __ceph_choose_frag(ci, v, pfrag, found);
231 mutex_unlock(&ci->i_fragtree_mutex);
232 return ret;
233}
234
227/* 235/*
228 * Process dirfrag (delegation) info from the mds. Include leaf 236 * Process dirfrag (delegation) info from the mds. Include leaf
229 * fragment in tree ONLY if ndist > 0. Otherwise, only 237 * fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -237,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
237 u32 id = le32_to_cpu(dirinfo->frag); 245 u32 id = le32_to_cpu(dirinfo->frag);
238 int mds = le32_to_cpu(dirinfo->auth); 246 int mds = le32_to_cpu(dirinfo->auth);
239 int ndist = le32_to_cpu(dirinfo->ndist); 247 int ndist = le32_to_cpu(dirinfo->ndist);
248 int diri_auth = -1;
240 int i; 249 int i;
241 int err = 0; 250 int err = 0;
242 251
252 spin_lock(&ci->i_ceph_lock);
253 if (ci->i_auth_cap)
254 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock);
256
243 mutex_lock(&ci->i_fragtree_mutex); 257 mutex_lock(&ci->i_fragtree_mutex);
244 if (ndist == 0) { 258 if (ndist == 0 && mds == diri_auth) {
245 /* no delegation info needed. */ 259 /* no delegation info needed. */
246 frag = __ceph_find_frag(ci, id); 260 frag = __ceph_find_frag(ci, id);
247 if (!frag) 261 if (!frag)
@@ -286,6 +300,75 @@ out:
286 return err; 300 return err;
287} 301}
288 302
303static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo)
306{
307 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag;
309 struct rb_node *rb_node;
310 int i;
311 u32 id, nsplits;
312 bool update = false;
313
314 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) {
317 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id))
320 update = true;
321 } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
322 rb_node = rb_first(&ci->i_fragtree);
323 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
324 if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
325 update = true;
326 }
327 if (!update && dirinfo) {
328 id = le32_to_cpu(dirinfo->frag);
329 if (id != __ceph_choose_frag(ci, id, NULL, NULL))
330 update = true;
331 }
332 if (!update)
333 goto out_unlock;
334
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag);
339 frag = NULL;
340 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
342 if (ceph_frag_compare(frag->frag, id) >= 0) {
343 if (frag->frag != id)
344 frag = NULL;
345 else
346 rb_node = rb_next(rb_node);
347 break;
348 }
349 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree);
351 kfree(frag);
352 frag = NULL;
353 }
354 if (!frag) {
355 frag = __get_or_create_frag(ci, id);
356 if (IS_ERR(frag))
357 continue;
358 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by);
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
361 }
362 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree);
366 kfree(frag);
367 }
368out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex);
370 return 0;
371}
289 372
290/* 373/*
291 * initialize a newly allocated inode. 374 * initialize a newly allocated inode.
@@ -341,7 +424,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
341 INIT_LIST_HEAD(&ci->i_cap_snaps); 424 INIT_LIST_HEAD(&ci->i_cap_snaps);
342 ci->i_head_snapc = NULL; 425 ci->i_head_snapc = NULL;
343 ci->i_snap_caps = 0; 426 ci->i_snap_caps = 0;
344 ci->i_cap_exporting_issued = 0;
345 427
346 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 428 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
347 ci->i_nr_by_mode[i] = 0; 429 ci->i_nr_by_mode[i] = 0;
@@ -407,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode)
407 489
408 /* 490 /*
409 * we may still have a snap_realm reference if there are stray 491 * we may still have a snap_realm reference if there are stray
410 * caps in i_cap_exporting_issued or i_snap_caps. 492 * caps in i_snap_caps.
411 */ 493 */
412 if (ci->i_snap_realm) { 494 if (ci->i_snap_realm) {
413 struct ceph_mds_client *mdsc = 495 struct ceph_mds_client *mdsc =
@@ -582,22 +664,26 @@ static int fill_inode(struct inode *inode,
582 unsigned long ttl_from, int cap_fmode, 664 unsigned long ttl_from, int cap_fmode,
583 struct ceph_cap_reservation *caps_reservation) 665 struct ceph_cap_reservation *caps_reservation)
584{ 666{
667 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
585 struct ceph_mds_reply_inode *info = iinfo->in; 668 struct ceph_mds_reply_inode *info = iinfo->in;
586 struct ceph_inode_info *ci = ceph_inode(inode); 669 struct ceph_inode_info *ci = ceph_inode(inode);
587 int i; 670 int issued = 0, implemented, new_issued;
588 int issued = 0, implemented;
589 struct timespec mtime, atime, ctime; 671 struct timespec mtime, atime, ctime;
590 u32 nsplits;
591 struct ceph_inode_frag *frag;
592 struct rb_node *rb_node;
593 struct ceph_buffer *xattr_blob = NULL; 672 struct ceph_buffer *xattr_blob = NULL;
673 struct ceph_cap *new_cap = NULL;
594 int err = 0; 674 int err = 0;
595 int queue_trunc = 0; 675 bool wake = false;
676 bool queue_trunc = false;
677 bool new_version = false;
596 678
597 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", 679 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
598 inode, ceph_vinop(inode), le64_to_cpu(info->version), 680 inode, ceph_vinop(inode), le64_to_cpu(info->version),
599 ci->i_version); 681 ci->i_version);
600 682
683 /* prealloc new cap struct */
684 if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
685 new_cap = ceph_get_cap(mdsc, caps_reservation);
686
601 /* 687 /*
602 * prealloc xattr data, if it looks like we'll need it. only 688 * prealloc xattr data, if it looks like we'll need it. only
603 * if len > 4 (meaning there are actually xattrs; the first 4 689 * if len > 4 (meaning there are actually xattrs; the first 4
@@ -623,19 +709,23 @@ static int fill_inode(struct inode *inode,
623 * 3 2 skip 709 * 3 2 skip
624 * 3 3 update 710 * 3 3 update
625 */ 711 */
626 if (le64_to_cpu(info->version) > 0 && 712 if (ci->i_version == 0 ||
627 (ci->i_version & ~1) >= le64_to_cpu(info->version)) 713 ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
628 goto no_change; 714 le64_to_cpu(info->version) > (ci->i_version & ~1)))
629 715 new_version = true;
716
630 issued = __ceph_caps_issued(ci, &implemented); 717 issued = __ceph_caps_issued(ci, &implemented);
631 issued |= implemented | __ceph_caps_dirty(ci); 718 issued |= implemented | __ceph_caps_dirty(ci);
719 new_issued = ~issued & le32_to_cpu(info->cap.caps);
632 720
633 /* update inode */ 721 /* update inode */
634 ci->i_version = le64_to_cpu(info->version); 722 ci->i_version = le64_to_cpu(info->version);
635 inode->i_version++; 723 inode->i_version++;
636 inode->i_rdev = le32_to_cpu(info->rdev); 724 inode->i_rdev = le32_to_cpu(info->rdev);
725 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
637 726
638 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 727 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
728 (issued & CEPH_CAP_AUTH_EXCL) == 0) {
639 inode->i_mode = le32_to_cpu(info->mode); 729 inode->i_mode = le32_to_cpu(info->mode);
640 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); 730 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
641 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); 731 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
@@ -644,23 +734,35 @@ static int fill_inode(struct inode *inode,
644 from_kgid(&init_user_ns, inode->i_gid)); 734 from_kgid(&init_user_ns, inode->i_gid));
645 } 735 }
646 736
647 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 737 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
738 (issued & CEPH_CAP_LINK_EXCL) == 0)
648 set_nlink(inode, le32_to_cpu(info->nlink)); 739 set_nlink(inode, le32_to_cpu(info->nlink));
649 740
650 /* be careful with mtime, atime, size */ 741 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
651 ceph_decode_timespec(&atime, &info->atime); 742 /* be careful with mtime, atime, size */
652 ceph_decode_timespec(&mtime, &info->mtime); 743 ceph_decode_timespec(&atime, &info->atime);
653 ceph_decode_timespec(&ctime, &info->ctime); 744 ceph_decode_timespec(&mtime, &info->mtime);
654 queue_trunc = ceph_fill_file_size(inode, issued, 745 ceph_decode_timespec(&ctime, &info->ctime);
655 le32_to_cpu(info->truncate_seq), 746 ceph_fill_file_time(inode, issued,
656 le64_to_cpu(info->truncate_size), 747 le32_to_cpu(info->time_warp_seq),
657 le64_to_cpu(info->size)); 748 &ctime, &mtime, &atime);
658 ceph_fill_file_time(inode, issued, 749 }
659 le32_to_cpu(info->time_warp_seq), 750
660 &ctime, &mtime, &atime); 751 if (new_version ||
661 752 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
662 ci->i_layout = info->layout; 753 ci->i_layout = info->layout;
663 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 754 queue_trunc = ceph_fill_file_size(inode, issued,
755 le32_to_cpu(info->truncate_seq),
756 le64_to_cpu(info->truncate_size),
757 le64_to_cpu(info->size));
758 /* only update max_size on auth cap */
759 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
760 ci->i_max_size != le64_to_cpu(info->max_size)) {
761 dout("max_size %lld -> %llu\n", ci->i_max_size,
762 le64_to_cpu(info->max_size));
763 ci->i_max_size = le64_to_cpu(info->max_size);
764 }
765 }
664 766
665 /* xattrs */ 767 /* xattrs */
666 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ 768 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -745,58 +847,6 @@ static int fill_inode(struct inode *inode,
745 dout(" marking %p complete (empty)\n", inode); 847 dout(" marking %p complete (empty)\n", inode);
746 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 848 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
747 } 849 }
748no_change:
749 /* only update max_size on auth cap */
750 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
751 ci->i_max_size != le64_to_cpu(info->max_size)) {
752 dout("max_size %lld -> %llu\n", ci->i_max_size,
753 le64_to_cpu(info->max_size));
754 ci->i_max_size = le64_to_cpu(info->max_size);
755 }
756
757 spin_unlock(&ci->i_ceph_lock);
758
759 /* queue truncate if we saw i_size decrease */
760 if (queue_trunc)
761 ceph_queue_vmtruncate(inode);
762
763 /* populate frag tree */
764 /* FIXME: move me up, if/when version reflects fragtree changes */
765 nsplits = le32_to_cpu(info->fragtree.nsplits);
766 mutex_lock(&ci->i_fragtree_mutex);
767 rb_node = rb_first(&ci->i_fragtree);
768 for (i = 0; i < nsplits; i++) {
769 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
770 frag = NULL;
771 while (rb_node) {
772 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
773 if (ceph_frag_compare(frag->frag, id) >= 0) {
774 if (frag->frag != id)
775 frag = NULL;
776 else
777 rb_node = rb_next(rb_node);
778 break;
779 }
780 rb_node = rb_next(rb_node);
781 rb_erase(&frag->node, &ci->i_fragtree);
782 kfree(frag);
783 frag = NULL;
784 }
785 if (!frag) {
786 frag = __get_or_create_frag(ci, id);
787 if (IS_ERR(frag))
788 continue;
789 }
790 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
791 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
792 }
793 while (rb_node) {
794 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
795 rb_node = rb_next(rb_node);
796 rb_erase(&frag->node, &ci->i_fragtree);
797 kfree(frag);
798 }
799 mutex_unlock(&ci->i_fragtree_mutex);
800 850
801 /* were we issued a capability? */ 851 /* were we issued a capability? */
802 if (info->cap.caps) { 852 if (info->cap.caps) {
@@ -809,30 +859,41 @@ no_change:
809 le32_to_cpu(info->cap.seq), 859 le32_to_cpu(info->cap.seq),
810 le32_to_cpu(info->cap.mseq), 860 le32_to_cpu(info->cap.mseq),
811 le64_to_cpu(info->cap.realm), 861 le64_to_cpu(info->cap.realm),
812 info->cap.flags, 862 info->cap.flags, &new_cap);
813 caps_reservation); 863 wake = true;
814 } else { 864 } else {
815 spin_lock(&ci->i_ceph_lock);
816 dout(" %p got snap_caps %s\n", inode, 865 dout(" %p got snap_caps %s\n", inode,
817 ceph_cap_string(le32_to_cpu(info->cap.caps))); 866 ceph_cap_string(le32_to_cpu(info->cap.caps)));
818 ci->i_snap_caps |= le32_to_cpu(info->cap.caps); 867 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
819 if (cap_fmode >= 0) 868 if (cap_fmode >= 0)
820 __ceph_get_fmode(ci, cap_fmode); 869 __ceph_get_fmode(ci, cap_fmode);
821 spin_unlock(&ci->i_ceph_lock);
822 } 870 }
823 } else if (cap_fmode >= 0) { 871 } else if (cap_fmode >= 0) {
824 pr_warn("mds issued no caps on %llx.%llx\n", 872 pr_warn("mds issued no caps on %llx.%llx\n",
825 ceph_vinop(inode)); 873 ceph_vinop(inode));
826 __ceph_get_fmode(ci, cap_fmode); 874 __ceph_get_fmode(ci, cap_fmode);
827 } 875 }
876 spin_unlock(&ci->i_ceph_lock);
877
878 if (wake)
879 wake_up_all(&ci->i_cap_wq);
880
881 /* queue truncate if we saw i_size decrease */
882 if (queue_trunc)
883 ceph_queue_vmtruncate(inode);
884
885 /* populate frag tree */
886 if (S_ISDIR(inode->i_mode))
887 ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
828 888
829 /* update delegation info? */ 889 /* update delegation info? */
830 if (dirinfo) 890 if (dirinfo)
831 ceph_fill_dirfrag(inode, dirinfo); 891 ceph_fill_dirfrag(inode, dirinfo);
832 892
833 err = 0; 893 err = 0;
834
835out: 894out:
895 if (new_cap)
896 ceph_put_cap(mdsc, new_cap);
836 if (xattr_blob) 897 if (xattr_blob)
837 ceph_buffer_put(xattr_blob); 898 ceph_buffer_put(xattr_blob);
838 return err; 899 return err;
@@ -1485,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work)
1485 orig_gen = ci->i_rdcache_gen; 1546 orig_gen = ci->i_rdcache_gen;
1486 spin_unlock(&ci->i_ceph_lock); 1547 spin_unlock(&ci->i_ceph_lock);
1487 1548
1488 truncate_inode_pages(inode->i_mapping, 0); 1549 truncate_pagecache(inode, 0);
1489 1550
1490 spin_lock(&ci->i_ceph_lock); 1551 spin_lock(&ci->i_ceph_lock);
1491 if (orig_gen == ci->i_rdcache_gen && 1552 if (orig_gen == ci->i_rdcache_gen &&
@@ -1588,7 +1649,7 @@ retry:
1588 ci->i_truncate_pending, to); 1649 ci->i_truncate_pending, to);
1589 spin_unlock(&ci->i_ceph_lock); 1650 spin_unlock(&ci->i_ceph_lock);
1590 1651
1591 truncate_inode_pages(inode->i_mapping, to); 1652 truncate_pagecache(inode, to);
1592 1653
1593 spin_lock(&ci->i_ceph_lock); 1654 spin_lock(&ci->i_ceph_lock);
1594 if (to == ci->i_truncate_size) { 1655 if (to == ci->i_truncate_size) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9a33b98cb000..92a2548278fc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1558,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1558 init_completion(&req->r_safe_completion); 1558 init_completion(&req->r_safe_completion);
1559 INIT_LIST_HEAD(&req->r_unsafe_item); 1559 INIT_LIST_HEAD(&req->r_unsafe_item);
1560 1560
1561 req->r_stamp = CURRENT_TIME;
1562
1561 req->r_op = op; 1563 req->r_op = op;
1562 req->r_direct_mode = mode; 1564 req->r_direct_mode = mode;
1563 return req; 1565 return req;
@@ -1783,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1783 } 1785 }
1784 1786
1785 len = sizeof(*head) + 1787 len = sizeof(*head) +
1786 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); 1788 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
1789 sizeof(struct timespec);
1787 1790
1788 /* calculate (max) length for cap releases */ 1791 /* calculate (max) length for cap releases */
1789 len += sizeof(struct ceph_mds_request_release) * 1792 len += sizeof(struct ceph_mds_request_release) *
@@ -1800,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1800 goto out_free2; 1803 goto out_free2;
1801 } 1804 }
1802 1805
1806 msg->hdr.version = 2;
1803 msg->hdr.tid = cpu_to_le64(req->r_tid); 1807 msg->hdr.tid = cpu_to_le64(req->r_tid);
1804 1808
1805 head = msg->front.iov_base; 1809 head = msg->front.iov_base;
@@ -1836,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1836 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); 1840 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1837 head->num_releases = cpu_to_le16(releases); 1841 head->num_releases = cpu_to_le16(releases);
1838 1842
1843 /* time stamp */
1844 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
1845
1839 BUG_ON(p > end); 1846 BUG_ON(p > end);
1840 msg->front.iov_len = p - msg->front.iov_base; 1847 msg->front.iov_len = p - msg->front.iov_base;
1841 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1848 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e90cfccf93bd..e00737cf523c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -194,6 +194,7 @@ struct ceph_mds_request {
194 int r_fmode; /* file mode, if expecting cap */ 194 int r_fmode; /* file mode, if expecting cap */
195 kuid_t r_uid; 195 kuid_t r_uid;
196 kgid_t r_gid; 196 kgid_t r_gid;
197 struct timespec r_stamp;
197 198
198 /* for choosing which mds to send this request to */ 199 /* for choosing which mds to send this request to */
199 int r_direct_mode; 200 int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ead05cc1f447..12b20744e386 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -292,7 +292,6 @@ struct ceph_inode_info {
292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
293 dirty|flushing caps */ 293 dirty|flushing caps */
294 unsigned i_snap_caps; /* cap bits for snapped files */ 294 unsigned i_snap_caps; /* cap bits for snapped files */
295 unsigned i_cap_exporting_issued;
296 295
297 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 296 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
298 297
@@ -775,11 +774,13 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
775extern const char *ceph_cap_string(int c); 774extern const char *ceph_cap_string(int c);
776extern void ceph_handle_caps(struct ceph_mds_session *session, 775extern void ceph_handle_caps(struct ceph_mds_session *session,
777 struct ceph_msg *msg); 776 struct ceph_msg *msg);
778extern int ceph_add_cap(struct inode *inode, 777extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
779 struct ceph_mds_session *session, u64 cap_id, 778 struct ceph_cap_reservation *ctx);
780 int fmode, unsigned issued, unsigned wanted, 779extern void ceph_add_cap(struct inode *inode,
781 unsigned cap, unsigned seq, u64 realmino, int flags, 780 struct ceph_mds_session *session, u64 cap_id,
782 struct ceph_cap_reservation *caps_reservation); 781 int fmode, unsigned issued, unsigned wanted,
782 unsigned cap, unsigned seq, u64 realmino, int flags,
783 struct ceph_cap **new_cap);
783extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 784extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
784extern void ceph_put_cap(struct ceph_mds_client *mdsc, 785extern void ceph_put_cap(struct ceph_mds_client *mdsc,
785 struct ceph_cap *cap); 786 struct ceph_cap *cap);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 0227b45ef00a..15e9505aa35f 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -290,7 +290,8 @@ int
290cifsConvertToUTF16(__le16 *target, const char *source, int srclen, 290cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
291 const struct nls_table *cp, int mapChars) 291 const struct nls_table *cp, int mapChars)
292{ 292{
293 int i, j, charlen; 293 int i, charlen;
294 int j = 0;
294 char src_char; 295 char src_char;
295 __le16 dst_char; 296 __le16 dst_char;
296 wchar_t tmp; 297 wchar_t tmp;
@@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
298 if (!mapChars) 299 if (!mapChars)
299 return cifs_strtoUTF16(target, source, PATH_MAX, cp); 300 return cifs_strtoUTF16(target, source, PATH_MAX, cp);
300 301
301 for (i = 0, j = 0; i < srclen; j++) { 302 for (i = 0; i < srclen; j++) {
302 src_char = source[i]; 303 src_char = source[i];
303 charlen = 1; 304 charlen = 1;
304 switch (src_char) { 305 switch (src_char) {
305 case 0: 306 case 0:
306 put_unaligned(0, &target[j]);
307 goto ctoUTF16_out; 307 goto ctoUTF16_out;
308 case ':': 308 case ':':
309 dst_char = cpu_to_le16(UNI_COLON); 309 dst_char = cpu_to_le16(UNI_COLON);
@@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
350 } 350 }
351 351
352ctoUTF16_out: 352ctoUTF16_out:
353 put_unaligned(0, &target[j]); /* Null terminate target unicode string */
353 return j; 354 return j;
354} 355}
355 356
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6aaa8112c538..888398067420 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -725,8 +725,20 @@ out_nls:
725 goto out; 725 goto out;
726} 726}
727 727
728static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 728static ssize_t
729 unsigned long nr_segs, loff_t pos) 729cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
730{
731 ssize_t rc;
732 struct inode *inode = file_inode(iocb->ki_filp);
733
734 rc = cifs_revalidate_mapping(inode);
735 if (rc)
736 return rc;
737
738 return generic_file_read_iter(iocb, iter);
739}
740
741static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
730{ 742{
731 struct inode *inode = file_inode(iocb->ki_filp); 743 struct inode *inode = file_inode(iocb->ki_filp);
732 struct cifsInodeInfo *cinode = CIFS_I(inode); 744 struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -737,14 +749,14 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
737 if (written) 749 if (written)
738 return written; 750 return written;
739 751
740 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 752 written = generic_file_write_iter(iocb, from);
741 753
742 if (CIFS_CACHE_WRITE(CIFS_I(inode))) 754 if (CIFS_CACHE_WRITE(CIFS_I(inode)))
743 goto out; 755 goto out;
744 756
745 rc = filemap_fdatawrite(inode->i_mapping); 757 rc = filemap_fdatawrite(inode->i_mapping);
746 if (rc) 758 if (rc)
747 cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", 759 cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
748 rc, inode); 760 rc, inode);
749 761
750out: 762out:
@@ -880,10 +892,10 @@ const struct inode_operations cifs_symlink_inode_ops = {
880}; 892};
881 893
882const struct file_operations cifs_file_ops = { 894const struct file_operations cifs_file_ops = {
883 .read = do_sync_read, 895 .read = new_sync_read,
884 .write = do_sync_write, 896 .write = new_sync_write,
885 .aio_read = generic_file_aio_read, 897 .read_iter = cifs_loose_read_iter,
886 .aio_write = cifs_file_aio_write, 898 .write_iter = cifs_file_write_iter,
887 .open = cifs_open, 899 .open = cifs_open,
888 .release = cifs_close, 900 .release = cifs_close,
889 .lock = cifs_lock, 901 .lock = cifs_lock,
@@ -899,10 +911,10 @@ const struct file_operations cifs_file_ops = {
899}; 911};
900 912
901const struct file_operations cifs_file_strict_ops = { 913const struct file_operations cifs_file_strict_ops = {
902 .read = do_sync_read, 914 .read = new_sync_read,
903 .write = do_sync_write, 915 .write = new_sync_write,
904 .aio_read = cifs_strict_readv, 916 .read_iter = cifs_strict_readv,
905 .aio_write = cifs_strict_writev, 917 .write_iter = cifs_strict_writev,
906 .open = cifs_open, 918 .open = cifs_open,
907 .release = cifs_close, 919 .release = cifs_close,
908 .lock = cifs_lock, 920 .lock = cifs_lock,
@@ -919,10 +931,10 @@ const struct file_operations cifs_file_strict_ops = {
919 931
920const struct file_operations cifs_file_direct_ops = { 932const struct file_operations cifs_file_direct_ops = {
921 /* BB reevaluate whether they can be done with directio, no cache */ 933 /* BB reevaluate whether they can be done with directio, no cache */
922 .read = do_sync_read, 934 .read = new_sync_read,
923 .write = do_sync_write, 935 .write = new_sync_write,
924 .aio_read = cifs_user_readv, 936 .read_iter = cifs_user_readv,
925 .aio_write = cifs_user_writev, 937 .write_iter = cifs_user_writev,
926 .open = cifs_open, 938 .open = cifs_open,
927 .release = cifs_close, 939 .release = cifs_close,
928 .lock = cifs_lock, 940 .lock = cifs_lock,
@@ -938,10 +950,10 @@ const struct file_operations cifs_file_direct_ops = {
938}; 950};
939 951
940const struct file_operations cifs_file_nobrl_ops = { 952const struct file_operations cifs_file_nobrl_ops = {
941 .read = do_sync_read, 953 .read = new_sync_read,
942 .write = do_sync_write, 954 .write = new_sync_write,
943 .aio_read = generic_file_aio_read, 955 .read_iter = cifs_loose_read_iter,
944 .aio_write = cifs_file_aio_write, 956 .write_iter = cifs_file_write_iter,
945 .open = cifs_open, 957 .open = cifs_open,
946 .release = cifs_close, 958 .release = cifs_close,
947 .fsync = cifs_fsync, 959 .fsync = cifs_fsync,
@@ -956,10 +968,10 @@ const struct file_operations cifs_file_nobrl_ops = {
956}; 968};
957 969
958const struct file_operations cifs_file_strict_nobrl_ops = { 970const struct file_operations cifs_file_strict_nobrl_ops = {
959 .read = do_sync_read, 971 .read = new_sync_read,
960 .write = do_sync_write, 972 .write = new_sync_write,
961 .aio_read = cifs_strict_readv, 973 .read_iter = cifs_strict_readv,
962 .aio_write = cifs_strict_writev, 974 .write_iter = cifs_strict_writev,
963 .open = cifs_open, 975 .open = cifs_open,
964 .release = cifs_close, 976 .release = cifs_close,
965 .fsync = cifs_strict_fsync, 977 .fsync = cifs_strict_fsync,
@@ -975,10 +987,10 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
975 987
976const struct file_operations cifs_file_direct_nobrl_ops = { 988const struct file_operations cifs_file_direct_nobrl_ops = {
977 /* BB reevaluate whether they can be done with directio, no cache */ 989 /* BB reevaluate whether they can be done with directio, no cache */
978 .read = do_sync_read, 990 .read = new_sync_read,
979 .write = do_sync_write, 991 .write = new_sync_write,
980 .aio_read = cifs_user_readv, 992 .read_iter = cifs_user_readv,
981 .aio_write = cifs_user_writev, 993 .write_iter = cifs_user_writev,
982 .open = cifs_open, 994 .open = cifs_open,
983 .release = cifs_close, 995 .release = cifs_close,
984 .fsync = cifs_fsync, 996 .fsync = cifs_fsync,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 8fe51166d6e3..70f178a7c759 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -95,14 +95,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
95extern int cifs_open(struct inode *inode, struct file *file); 95extern int cifs_open(struct inode *inode, struct file *file);
96extern int cifs_close(struct inode *inode, struct file *file); 96extern int cifs_close(struct inode *inode, struct file *file);
97extern int cifs_closedir(struct inode *inode, struct file *file); 97extern int cifs_closedir(struct inode *inode, struct file *file);
98extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, 98extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
99 unsigned long nr_segs, loff_t pos); 99extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
100extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, 100extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
101 unsigned long nr_segs, loff_t pos); 101extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
102extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
103 unsigned long nr_segs, loff_t pos);
104extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
105 unsigned long nr_segs, loff_t pos);
106extern int cifs_lock(struct file *, int, struct file_lock *); 102extern int cifs_lock(struct file *, int, struct file_lock *);
107extern int cifs_fsync(struct file *, loff_t, loff_t, int); 103extern int cifs_fsync(struct file *, loff_t, loff_t, int);
108extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); 104extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 208f56eca4bf..e90a1e9aa627 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2385,14 +2385,12 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata)
2385} 2385}
2386 2386
2387static ssize_t 2387static ssize_t
2388cifs_iovec_write(struct file *file, const struct iovec *iov, 2388cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
2389 unsigned long nr_segs, loff_t *poffset)
2390{ 2389{
2391 unsigned long nr_pages, i; 2390 unsigned long nr_pages, i;
2392 size_t bytes, copied, len, cur_len; 2391 size_t bytes, copied, len, cur_len;
2393 ssize_t total_written = 0; 2392 ssize_t total_written = 0;
2394 loff_t offset; 2393 loff_t offset;
2395 struct iov_iter it;
2396 struct cifsFileInfo *open_file; 2394 struct cifsFileInfo *open_file;
2397 struct cifs_tcon *tcon; 2395 struct cifs_tcon *tcon;
2398 struct cifs_sb_info *cifs_sb; 2396 struct cifs_sb_info *cifs_sb;
@@ -2401,14 +2399,16 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
2401 int rc; 2399 int rc;
2402 pid_t pid; 2400 pid_t pid;
2403 2401
2404 len = iov_length(iov, nr_segs); 2402 len = iov_iter_count(from);
2405 if (!len)
2406 return 0;
2407
2408 rc = generic_write_checks(file, poffset, &len, 0); 2403 rc = generic_write_checks(file, poffset, &len, 0);
2409 if (rc) 2404 if (rc)
2410 return rc; 2405 return rc;
2411 2406
2407 if (!len)
2408 return 0;
2409
2410 iov_iter_truncate(from, len);
2411
2412 INIT_LIST_HEAD(&wdata_list); 2412 INIT_LIST_HEAD(&wdata_list);
2413 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2413 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2414 open_file = file->private_data; 2414 open_file = file->private_data;
@@ -2424,7 +2424,6 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
2424 else 2424 else
2425 pid = current->tgid; 2425 pid = current->tgid;
2426 2426
2427 iov_iter_init(&it, iov, nr_segs, len, 0);
2428 do { 2427 do {
2429 size_t save_len; 2428 size_t save_len;
2430 2429
@@ -2444,11 +2443,10 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
2444 2443
2445 save_len = cur_len; 2444 save_len = cur_len;
2446 for (i = 0; i < nr_pages; i++) { 2445 for (i = 0; i < nr_pages; i++) {
2447 bytes = min_t(const size_t, cur_len, PAGE_SIZE); 2446 bytes = min_t(size_t, cur_len, PAGE_SIZE);
2448 copied = iov_iter_copy_from_user(wdata->pages[i], &it, 2447 copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
2449 0, bytes); 2448 from);
2450 cur_len -= copied; 2449 cur_len -= copied;
2451 iov_iter_advance(&it, copied);
2452 /* 2450 /*
2453 * If we didn't copy as much as we expected, then that 2451 * If we didn't copy as much as we expected, then that
2454 * may mean we trod into an unmapped area. Stop copying 2452 * may mean we trod into an unmapped area. Stop copying
@@ -2546,11 +2544,11 @@ restart_loop:
2546 return total_written ? total_written : (ssize_t)rc; 2544 return total_written ? total_written : (ssize_t)rc;
2547} 2545}
2548 2546
2549ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, 2547ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
2550 unsigned long nr_segs, loff_t pos)
2551{ 2548{
2552 ssize_t written; 2549 ssize_t written;
2553 struct inode *inode; 2550 struct inode *inode;
2551 loff_t pos = iocb->ki_pos;
2554 2552
2555 inode = file_inode(iocb->ki_filp); 2553 inode = file_inode(iocb->ki_filp);
2556 2554
@@ -2560,7 +2558,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
2560 * write request. 2558 * write request.
2561 */ 2559 */
2562 2560
2563 written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos); 2561 written = cifs_iovec_write(iocb->ki_filp, from, &pos);
2564 if (written > 0) { 2562 if (written > 0) {
2565 set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); 2563 set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
2566 iocb->ki_pos = pos; 2564 iocb->ki_pos = pos;
@@ -2570,8 +2568,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
2570} 2568}
2571 2569
2572static ssize_t 2570static ssize_t
2573cifs_writev(struct kiocb *iocb, const struct iovec *iov, 2571cifs_writev(struct kiocb *iocb, struct iov_iter *from)
2574 unsigned long nr_segs, loff_t pos)
2575{ 2572{
2576 struct file *file = iocb->ki_filp; 2573 struct file *file = iocb->ki_filp;
2577 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; 2574 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2589,10 +2586,10 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
2589 mutex_lock(&inode->i_mutex); 2586 mutex_lock(&inode->i_mutex);
2590 if (file->f_flags & O_APPEND) 2587 if (file->f_flags & O_APPEND)
2591 lock_pos = i_size_read(inode); 2588 lock_pos = i_size_read(inode);
2592 if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), 2589 if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
2593 server->vals->exclusive_lock_type, NULL, 2590 server->vals->exclusive_lock_type, NULL,
2594 CIFS_WRITE_OP)) { 2591 CIFS_WRITE_OP)) {
2595 rc = __generic_file_aio_write(iocb, iov, nr_segs); 2592 rc = __generic_file_write_iter(iocb, from);
2596 mutex_unlock(&inode->i_mutex); 2593 mutex_unlock(&inode->i_mutex);
2597 2594
2598 if (rc > 0) { 2595 if (rc > 0) {
@@ -2610,8 +2607,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
2610} 2607}
2611 2608
2612ssize_t 2609ssize_t
2613cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, 2610cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
2614 unsigned long nr_segs, loff_t pos)
2615{ 2611{
2616 struct inode *inode = file_inode(iocb->ki_filp); 2612 struct inode *inode = file_inode(iocb->ki_filp);
2617 struct cifsInodeInfo *cinode = CIFS_I(inode); 2613 struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -2629,11 +2625,10 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
2629 if (cap_unix(tcon->ses) && 2625 if (cap_unix(tcon->ses) &&
2630 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) 2626 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
2631 && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { 2627 && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
2632 written = generic_file_aio_write( 2628 written = generic_file_write_iter(iocb, from);
2633 iocb, iov, nr_segs, pos);
2634 goto out; 2629 goto out;
2635 } 2630 }
2636 written = cifs_writev(iocb, iov, nr_segs, pos); 2631 written = cifs_writev(iocb, from);
2637 goto out; 2632 goto out;
2638 } 2633 }
2639 /* 2634 /*
@@ -2642,7 +2637,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
2642 * affected pages because it may cause a error with mandatory locks on 2637 * affected pages because it may cause a error with mandatory locks on
2643 * these pages but not on the region from pos to ppos+len-1. 2638 * these pages but not on the region from pos to ppos+len-1.
2644 */ 2639 */
2645 written = cifs_user_writev(iocb, iov, nr_segs, pos); 2640 written = cifs_user_writev(iocb, from);
2646 if (written > 0 && CIFS_CACHE_READ(cinode)) { 2641 if (written > 0 && CIFS_CACHE_READ(cinode)) {
2647 /* 2642 /*
2648 * Windows 7 server can delay breaking level2 oplock if a write 2643 * Windows 7 server can delay breaking level2 oplock if a write
@@ -2831,32 +2826,25 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
2831 return total_read > 0 ? total_read : result; 2826 return total_read > 0 ? total_read : result;
2832} 2827}
2833 2828
2834ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, 2829ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
2835 unsigned long nr_segs, loff_t pos)
2836{ 2830{
2837 struct file *file = iocb->ki_filp; 2831 struct file *file = iocb->ki_filp;
2838 ssize_t rc; 2832 ssize_t rc;
2839 size_t len, cur_len; 2833 size_t len, cur_len;
2840 ssize_t total_read = 0; 2834 ssize_t total_read = 0;
2841 loff_t offset = pos; 2835 loff_t offset = iocb->ki_pos;
2842 unsigned int npages; 2836 unsigned int npages;
2843 struct cifs_sb_info *cifs_sb; 2837 struct cifs_sb_info *cifs_sb;
2844 struct cifs_tcon *tcon; 2838 struct cifs_tcon *tcon;
2845 struct cifsFileInfo *open_file; 2839 struct cifsFileInfo *open_file;
2846 struct cifs_readdata *rdata, *tmp; 2840 struct cifs_readdata *rdata, *tmp;
2847 struct list_head rdata_list; 2841 struct list_head rdata_list;
2848 struct iov_iter to;
2849 pid_t pid; 2842 pid_t pid;
2850 2843
2851 if (!nr_segs) 2844 len = iov_iter_count(to);
2852 return 0;
2853
2854 len = iov_length(iov, nr_segs);
2855 if (!len) 2845 if (!len)
2856 return 0; 2846 return 0;
2857 2847
2858 iov_iter_init(&to, iov, nr_segs, len, 0);
2859
2860 INIT_LIST_HEAD(&rdata_list); 2848 INIT_LIST_HEAD(&rdata_list);
2861 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2849 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2862 open_file = file->private_data; 2850 open_file = file->private_data;
@@ -2914,7 +2902,7 @@ error:
2914 if (!list_empty(&rdata_list)) 2902 if (!list_empty(&rdata_list))
2915 rc = 0; 2903 rc = 0;
2916 2904
2917 len = iov_iter_count(&to); 2905 len = iov_iter_count(to);
2918 /* the loop below should proceed in the order of increasing offsets */ 2906 /* the loop below should proceed in the order of increasing offsets */
2919 list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { 2907 list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
2920 again: 2908 again:
@@ -2931,7 +2919,7 @@ error:
2931 goto again; 2919 goto again;
2932 } 2920 }
2933 } else { 2921 } else {
2934 rc = cifs_readdata_to_iov(rdata, &to); 2922 rc = cifs_readdata_to_iov(rdata, to);
2935 } 2923 }
2936 2924
2937 } 2925 }
@@ -2939,7 +2927,7 @@ error:
2939 kref_put(&rdata->refcount, cifs_uncached_readdata_release); 2927 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
2940 } 2928 }
2941 2929
2942 total_read = len - iov_iter_count(&to); 2930 total_read = len - iov_iter_count(to);
2943 2931
2944 cifs_stats_bytes_read(tcon, total_read); 2932 cifs_stats_bytes_read(tcon, total_read);
2945 2933
@@ -2948,15 +2936,14 @@ error:
2948 rc = 0; 2936 rc = 0;
2949 2937
2950 if (total_read) { 2938 if (total_read) {
2951 iocb->ki_pos = pos + total_read; 2939 iocb->ki_pos += total_read;
2952 return total_read; 2940 return total_read;
2953 } 2941 }
2954 return rc; 2942 return rc;
2955} 2943}
2956 2944
2957ssize_t 2945ssize_t
2958cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, 2946cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
2959 unsigned long nr_segs, loff_t pos)
2960{ 2947{
2961 struct inode *inode = file_inode(iocb->ki_filp); 2948 struct inode *inode = file_inode(iocb->ki_filp);
2962 struct cifsInodeInfo *cinode = CIFS_I(inode); 2949 struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -2975,22 +2962,22 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
2975 * pos+len-1. 2962 * pos+len-1.
2976 */ 2963 */
2977 if (!CIFS_CACHE_READ(cinode)) 2964 if (!CIFS_CACHE_READ(cinode))
2978 return cifs_user_readv(iocb, iov, nr_segs, pos); 2965 return cifs_user_readv(iocb, to);
2979 2966
2980 if (cap_unix(tcon->ses) && 2967 if (cap_unix(tcon->ses) &&
2981 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 2968 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
2982 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 2969 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
2983 return generic_file_aio_read(iocb, iov, nr_segs, pos); 2970 return generic_file_read_iter(iocb, to);
2984 2971
2985 /* 2972 /*
2986 * We need to hold the sem to be sure nobody modifies lock list 2973 * We need to hold the sem to be sure nobody modifies lock list
2987 * with a brlock that prevents reading. 2974 * with a brlock that prevents reading.
2988 */ 2975 */
2989 down_read(&cinode->lock_sem); 2976 down_read(&cinode->lock_sem);
2990 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2977 if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
2991 tcon->ses->server->vals->shared_lock_type, 2978 tcon->ses->server->vals->shared_lock_type,
2992 NULL, CIFS_READ_OP)) 2979 NULL, CIFS_READ_OP))
2993 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 2980 rc = generic_file_read_iter(iocb, to);
2994 up_read(&cinode->lock_sem); 2981 up_read(&cinode->lock_sem);
2995 return rc; 2982 return rc;
2996} 2983}
@@ -3703,8 +3690,8 @@ void cifs_oplock_break(struct work_struct *work)
3703 * Direct IO is not yet supported in the cached mode. 3690 * Direct IO is not yet supported in the cached mode.
3704 */ 3691 */
3705static ssize_t 3692static ssize_t
3706cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov, 3693cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
3707 loff_t pos, unsigned long nr_segs) 3694 loff_t pos)
3708{ 3695{
3709 /* 3696 /*
3710 * FIXME 3697 * FIXME
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 264ece71bdb2..68559fd557fb 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -374,7 +374,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
374 oparms.cifs_sb = cifs_sb; 374 oparms.cifs_sb = cifs_sb;
375 oparms.desired_access = GENERIC_WRITE; 375 oparms.desired_access = GENERIC_WRITE;
376 oparms.create_options = create_options; 376 oparms.create_options = create_options;
377 oparms.disposition = FILE_OPEN; 377 oparms.disposition = FILE_CREATE;
378 oparms.path = path; 378 oparms.path = path;
379 oparms.fid = &fid; 379 oparms.fid = &fid;
380 oparms.reconnect = false; 380 oparms.reconnect = false;
diff --git a/fs/dcache.c b/fs/dcache.c
index 1792d6075b4f..06f65857a855 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -532,10 +532,12 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
532 struct dentry *parent = dentry->d_parent; 532 struct dentry *parent = dentry->d_parent;
533 if (IS_ROOT(dentry)) 533 if (IS_ROOT(dentry))
534 return NULL; 534 return NULL;
535 if (unlikely((int)dentry->d_lockref.count < 0))
536 return NULL;
535 if (likely(spin_trylock(&parent->d_lock))) 537 if (likely(spin_trylock(&parent->d_lock)))
536 return parent; 538 return parent;
537 spin_unlock(&dentry->d_lock);
538 rcu_read_lock(); 539 rcu_read_lock();
540 spin_unlock(&dentry->d_lock);
539again: 541again:
540 parent = ACCESS_ONCE(dentry->d_parent); 542 parent = ACCESS_ONCE(dentry->d_parent);
541 spin_lock(&parent->d_lock); 543 spin_lock(&parent->d_lock);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 31ba0935e32e..98040ba388ac 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -77,7 +77,6 @@ struct dio_submit {
77 unsigned blocks_available; /* At block_in_file. changes */ 77 unsigned blocks_available; /* At block_in_file. changes */
78 int reap_counter; /* rate limit reaping */ 78 int reap_counter; /* rate limit reaping */
79 sector_t final_block_in_request;/* doesn't change */ 79 sector_t final_block_in_request;/* doesn't change */
80 unsigned first_block_in_page; /* doesn't change, Used only once */
81 int boundary; /* prev block is at a boundary */ 80 int boundary; /* prev block is at a boundary */
82 get_block_t *get_block; /* block mapping function */ 81 get_block_t *get_block; /* block mapping function */
83 dio_submit_t *submit_io; /* IO submition function */ 82 dio_submit_t *submit_io; /* IO submition function */
@@ -98,19 +97,14 @@ struct dio_submit {
98 sector_t cur_page_block; /* Where it starts */ 97 sector_t cur_page_block; /* Where it starts */
99 loff_t cur_page_fs_offset; /* Offset in file */ 98 loff_t cur_page_fs_offset; /* Offset in file */
100 99
101 /* 100 struct iov_iter *iter;
102 * Page fetching state. These variables belong to dio_refill_pages().
103 */
104 int curr_page; /* changes */
105 int total_pages; /* doesn't change */
106 unsigned long curr_user_address;/* changes */
107
108 /* 101 /*
109 * Page queue. These variables belong to dio_refill_pages() and 102 * Page queue. These variables belong to dio_refill_pages() and
110 * dio_get_page(). 103 * dio_get_page().
111 */ 104 */
112 unsigned head; /* next page to process */ 105 unsigned head; /* next page to process */
113 unsigned tail; /* last valid page + 1 */ 106 unsigned tail; /* last valid page + 1 */
107 size_t from, to;
114}; 108};
115 109
116/* dio_state communicated between submission path and end_io */ 110/* dio_state communicated between submission path and end_io */
@@ -163,15 +157,10 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
163 */ 157 */
164static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) 158static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
165{ 159{
166 int ret; 160 ssize_t ret;
167 int nr_pages;
168 161
169 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); 162 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
170 ret = get_user_pages_fast( 163 &sdio->from);
171 sdio->curr_user_address, /* Where from? */
172 nr_pages, /* How many pages? */
173 dio->rw == READ, /* Write to memory? */
174 &dio->pages[0]); /* Put results here */
175 164
176 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 165 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
177 struct page *page = ZERO_PAGE(0); 166 struct page *page = ZERO_PAGE(0);
@@ -186,18 +175,19 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
186 dio->pages[0] = page; 175 dio->pages[0] = page;
187 sdio->head = 0; 176 sdio->head = 0;
188 sdio->tail = 1; 177 sdio->tail = 1;
189 ret = 0; 178 sdio->from = 0;
190 goto out; 179 sdio->to = PAGE_SIZE;
180 return 0;
191 } 181 }
192 182
193 if (ret >= 0) { 183 if (ret >= 0) {
194 sdio->curr_user_address += ret * PAGE_SIZE; 184 iov_iter_advance(sdio->iter, ret);
195 sdio->curr_page += ret; 185 ret += sdio->from;
196 sdio->head = 0; 186 sdio->head = 0;
197 sdio->tail = ret; 187 sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
198 ret = 0; 188 sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
189 return 0;
199 } 190 }
200out:
201 return ret; 191 return ret;
202} 192}
203 193
@@ -208,8 +198,9 @@ out:
208 * L1 cache. 198 * L1 cache.
209 */ 199 */
210static inline struct page *dio_get_page(struct dio *dio, 200static inline struct page *dio_get_page(struct dio *dio,
211 struct dio_submit *sdio) 201 struct dio_submit *sdio, size_t *from, size_t *to)
212{ 202{
203 int n;
213 if (dio_pages_present(sdio) == 0) { 204 if (dio_pages_present(sdio) == 0) {
214 int ret; 205 int ret;
215 206
@@ -218,7 +209,10 @@ static inline struct page *dio_get_page(struct dio *dio,
218 return ERR_PTR(ret); 209 return ERR_PTR(ret);
219 BUG_ON(dio_pages_present(sdio) == 0); 210 BUG_ON(dio_pages_present(sdio) == 0);
220 } 211 }
221 return dio->pages[sdio->head++]; 212 n = sdio->head++;
213 *from = n ? 0 : sdio->from;
214 *to = (n == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
215 return dio->pages[n];
222} 216}
223 217
224/** 218/**
@@ -422,8 +416,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
422 */ 416 */
423static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) 417static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
424{ 418{
425 while (dio_pages_present(sdio)) 419 while (sdio->head < sdio->tail)
426 page_cache_release(dio_get_page(dio, sdio)); 420 page_cache_release(dio->pages[sdio->head++]);
427} 421}
428 422
429/* 423/*
@@ -912,23 +906,18 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
912 struct buffer_head *map_bh) 906 struct buffer_head *map_bh)
913{ 907{
914 const unsigned blkbits = sdio->blkbits; 908 const unsigned blkbits = sdio->blkbits;
915 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
916 struct page *page;
917 unsigned block_in_page;
918 int ret = 0; 909 int ret = 0;
919 910
920 /* The I/O can start at any block offset within the first page */
921 block_in_page = sdio->first_block_in_page;
922
923 while (sdio->block_in_file < sdio->final_block_in_request) { 911 while (sdio->block_in_file < sdio->final_block_in_request) {
924 page = dio_get_page(dio, sdio); 912 struct page *page;
913 size_t from, to;
914 page = dio_get_page(dio, sdio, &from, &to);
925 if (IS_ERR(page)) { 915 if (IS_ERR(page)) {
926 ret = PTR_ERR(page); 916 ret = PTR_ERR(page);
927 goto out; 917 goto out;
928 } 918 }
929 919
930 while (block_in_page < blocks_per_page) { 920 while (from < to) {
931 unsigned offset_in_page = block_in_page << blkbits;
932 unsigned this_chunk_bytes; /* # of bytes mapped */ 921 unsigned this_chunk_bytes; /* # of bytes mapped */
933 unsigned this_chunk_blocks; /* # of blocks */ 922 unsigned this_chunk_blocks; /* # of blocks */
934 unsigned u; 923 unsigned u;
@@ -999,10 +988,10 @@ do_holes:
999 page_cache_release(page); 988 page_cache_release(page);
1000 goto out; 989 goto out;
1001 } 990 }
1002 zero_user(page, block_in_page << blkbits, 991 zero_user(page, from, 1 << blkbits);
1003 1 << blkbits);
1004 sdio->block_in_file++; 992 sdio->block_in_file++;
1005 block_in_page++; 993 from += 1 << blkbits;
994 dio->result += 1 << blkbits;
1006 goto next_block; 995 goto next_block;
1007 } 996 }
1008 997
@@ -1019,7 +1008,7 @@ do_holes:
1019 * can add to this page 1008 * can add to this page
1020 */ 1009 */
1021 this_chunk_blocks = sdio->blocks_available; 1010 this_chunk_blocks = sdio->blocks_available;
1022 u = (PAGE_SIZE - offset_in_page) >> blkbits; 1011 u = (to - from) >> blkbits;
1023 if (this_chunk_blocks > u) 1012 if (this_chunk_blocks > u)
1024 this_chunk_blocks = u; 1013 this_chunk_blocks = u;
1025 u = sdio->final_block_in_request - sdio->block_in_file; 1014 u = sdio->final_block_in_request - sdio->block_in_file;
@@ -1031,7 +1020,7 @@ do_holes:
1031 if (this_chunk_blocks == sdio->blocks_available) 1020 if (this_chunk_blocks == sdio->blocks_available)
1032 sdio->boundary = buffer_boundary(map_bh); 1021 sdio->boundary = buffer_boundary(map_bh);
1033 ret = submit_page_section(dio, sdio, page, 1022 ret = submit_page_section(dio, sdio, page,
1034 offset_in_page, 1023 from,
1035 this_chunk_bytes, 1024 this_chunk_bytes,
1036 sdio->next_block_for_io, 1025 sdio->next_block_for_io,
1037 map_bh); 1026 map_bh);
@@ -1042,7 +1031,8 @@ do_holes:
1042 sdio->next_block_for_io += this_chunk_blocks; 1031 sdio->next_block_for_io += this_chunk_blocks;
1043 1032
1044 sdio->block_in_file += this_chunk_blocks; 1033 sdio->block_in_file += this_chunk_blocks;
1045 block_in_page += this_chunk_blocks; 1034 from += this_chunk_bytes;
1035 dio->result += this_chunk_bytes;
1046 sdio->blocks_available -= this_chunk_blocks; 1036 sdio->blocks_available -= this_chunk_blocks;
1047next_block: 1037next_block:
1048 BUG_ON(sdio->block_in_file > sdio->final_block_in_request); 1038 BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
@@ -1052,7 +1042,6 @@ next_block:
1052 1042
1053 /* Drop the ref which was taken in get_user_pages() */ 1043 /* Drop the ref which was taken in get_user_pages() */
1054 page_cache_release(page); 1044 page_cache_release(page);
1055 block_in_page = 0;
1056 } 1045 }
1057out: 1046out:
1058 return ret; 1047 return ret;
@@ -1107,24 +1096,20 @@ static inline int drop_refcount(struct dio *dio)
1107 */ 1096 */
1108static inline ssize_t 1097static inline ssize_t
1109do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1098do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1099 struct block_device *bdev, struct iov_iter *iter, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1100 get_block_t get_block, dio_iodone_t end_io,
1112 dio_submit_t submit_io, int flags) 1101 dio_submit_t submit_io, int flags)
1113{ 1102{
1114 int seg;
1115 size_t size;
1116 unsigned long addr;
1117 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); 1103 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
1118 unsigned blkbits = i_blkbits; 1104 unsigned blkbits = i_blkbits;
1119 unsigned blocksize_mask = (1 << blkbits) - 1; 1105 unsigned blocksize_mask = (1 << blkbits) - 1;
1120 ssize_t retval = -EINVAL; 1106 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1107 loff_t end = offset + iov_iter_count(iter);
1122 struct dio *dio; 1108 struct dio *dio;
1123 struct dio_submit sdio = { 0, }; 1109 struct dio_submit sdio = { 0, };
1124 unsigned long user_addr;
1125 size_t bytes;
1126 struct buffer_head map_bh = { 0, }; 1110 struct buffer_head map_bh = { 0, };
1127 struct blk_plug plug; 1111 struct blk_plug plug;
1112 unsigned long align = offset | iov_iter_alignment(iter);
1128 1113
1129 if (rw & WRITE) 1114 if (rw & WRITE)
1130 rw = WRITE_ODIRECT; 1115 rw = WRITE_ODIRECT;
@@ -1134,32 +1119,16 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1134 * the early prefetch in the caller enough time. 1119 * the early prefetch in the caller enough time.
1135 */ 1120 */
1136 1121
1137 if (offset & blocksize_mask) { 1122 if (align & blocksize_mask) {
1138 if (bdev) 1123 if (bdev)
1139 blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1124 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1140 blocksize_mask = (1 << blkbits) - 1; 1125 blocksize_mask = (1 << blkbits) - 1;
1141 if (offset & blocksize_mask) 1126 if (align & blocksize_mask)
1142 goto out; 1127 goto out;
1143 } 1128 }
1144 1129
1145 /* Check the memory alignment. Blocks cannot straddle pages */
1146 for (seg = 0; seg < nr_segs; seg++) {
1147 addr = (unsigned long)iov[seg].iov_base;
1148 size = iov[seg].iov_len;
1149 end += size;
1150 if (unlikely((addr & blocksize_mask) ||
1151 (size & blocksize_mask))) {
1152 if (bdev)
1153 blkbits = blksize_bits(
1154 bdev_logical_block_size(bdev));
1155 blocksize_mask = (1 << blkbits) - 1;
1156 if ((addr & blocksize_mask) || (size & blocksize_mask))
1157 goto out;
1158 }
1159 }
1160
1161 /* watch out for a 0 len io from a tricksy fs */ 1130 /* watch out for a 0 len io from a tricksy fs */
1162 if (rw == READ && end == offset) 1131 if (rw == READ && !iov_iter_count(iter))
1163 return 0; 1132 return 0;
1164 1133
1165 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); 1134 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1249,6 +1218,10 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1249 spin_lock_init(&dio->bio_lock); 1218 spin_lock_init(&dio->bio_lock);
1250 dio->refcount = 1; 1219 dio->refcount = 1;
1251 1220
1221 sdio.iter = iter;
1222 sdio.final_block_in_request =
1223 (offset + iov_iter_count(iter)) >> blkbits;
1224
1252 /* 1225 /*
1253 * In case of non-aligned buffers, we may need 2 more 1226 * In case of non-aligned buffers, we may need 2 more
1254 * pages since we need to zero out first and last block. 1227 * pages since we need to zero out first and last block.
@@ -1256,47 +1229,13 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1256 if (unlikely(sdio.blkfactor)) 1229 if (unlikely(sdio.blkfactor))
1257 sdio.pages_in_io = 2; 1230 sdio.pages_in_io = 2;
1258 1231
1259 for (seg = 0; seg < nr_segs; seg++) { 1232 sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
1260 user_addr = (unsigned long)iov[seg].iov_base;
1261 sdio.pages_in_io +=
1262 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
1263 PAGE_SIZE - user_addr / PAGE_SIZE);
1264 }
1265 1233
1266 blk_start_plug(&plug); 1234 blk_start_plug(&plug);
1267 1235
1268 for (seg = 0; seg < nr_segs; seg++) { 1236 retval = do_direct_IO(dio, &sdio, &map_bh);
1269 user_addr = (unsigned long)iov[seg].iov_base; 1237 if (retval)
1270 sdio.size += bytes = iov[seg].iov_len; 1238 dio_cleanup(dio, &sdio);
1271
1272 /* Index into the first page of the first block */
1273 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1274 sdio.final_block_in_request = sdio.block_in_file +
1275 (bytes >> blkbits);
1276 /* Page fetching state */
1277 sdio.head = 0;
1278 sdio.tail = 0;
1279 sdio.curr_page = 0;
1280
1281 sdio.total_pages = 0;
1282 if (user_addr & (PAGE_SIZE-1)) {
1283 sdio.total_pages++;
1284 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1285 }
1286 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1287 sdio.curr_user_address = user_addr;
1288
1289 retval = do_direct_IO(dio, &sdio, &map_bh);
1290
1291 dio->result += iov[seg].iov_len -
1292 ((sdio.final_block_in_request - sdio.block_in_file) <<
1293 blkbits);
1294
1295 if (retval) {
1296 dio_cleanup(dio, &sdio);
1297 break;
1298 }
1299 } /* end iovec loop */
1300 1239
1301 if (retval == -ENOTBLK) { 1240 if (retval == -ENOTBLK) {
1302 /* 1241 /*
@@ -1365,8 +1304,8 @@ out:
1365 1304
1366ssize_t 1305ssize_t
1367__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1306__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1368 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1307 struct block_device *bdev, struct iov_iter *iter, loff_t offset,
1369 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1308 get_block_t get_block, dio_iodone_t end_io,
1370 dio_submit_t submit_io, int flags) 1309 dio_submit_t submit_io, int flags)
1371{ 1310{
1372 /* 1311 /*
@@ -1381,9 +1320,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1381 prefetch(bdev->bd_queue); 1320 prefetch(bdev->bd_queue);
1382 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); 1321 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1383 1322
1384 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 1323 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset,
1385 nr_segs, get_block, end_io, 1324 get_block, end_io, submit_io, flags);
1386 submit_io, flags);
1387} 1325}
1388 1326
1389EXPORT_SYMBOL(__blockdev_direct_IO); 1327EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 1e5b45359509..d08e079ea5d3 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -617,6 +617,11 @@ static void retry_failed_sctp_send(struct connection *recv_con,
617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid; 617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
618 618
619 log_print("Retry sending %d bytes to node id %d", len, nodeid); 619 log_print("Retry sending %d bytes to node id %d", len, nodeid);
620
621 if (!nodeid) {
622 log_print("Shouldn't resend data via listening connection.");
623 return;
624 }
620 625
621 con = nodeid2con(nodeid, 0); 626 con = nodeid2con(nodeid, 0);
622 if (!con) { 627 if (!con) {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index b1eaa7a1f82c..db0fad3269c0 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -45,14 +45,13 @@
45 * The function to be used for directory reads is ecryptfs_read. 45 * The function to be used for directory reads is ecryptfs_read.
46 */ 46 */
47static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, 47static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
48 const struct iovec *iov, 48 struct iov_iter *to)
49 unsigned long nr_segs, loff_t pos)
50{ 49{
51 ssize_t rc; 50 ssize_t rc;
52 struct path *path; 51 struct path *path;
53 struct file *file = iocb->ki_filp; 52 struct file *file = iocb->ki_filp;
54 53
55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 54 rc = generic_file_read_iter(iocb, to);
56 /* 55 /*
57 * Even though this is a async interface, we need to wait 56 * Even though this is a async interface, we need to wait
58 * for IO to finish to update atime 57 * for IO to finish to update atime
@@ -352,10 +351,10 @@ const struct file_operations ecryptfs_dir_fops = {
352 351
353const struct file_operations ecryptfs_main_fops = { 352const struct file_operations ecryptfs_main_fops = {
354 .llseek = generic_file_llseek, 353 .llseek = generic_file_llseek,
355 .read = do_sync_read, 354 .read = new_sync_read,
356 .aio_read = ecryptfs_read_update_atime, 355 .read_iter = ecryptfs_read_update_atime,
357 .write = do_sync_write, 356 .write = new_sync_write,
358 .aio_write = generic_file_aio_write, 357 .write_iter = generic_file_write_iter,
359 .iterate = ecryptfs_readdir, 358 .iterate = ecryptfs_readdir,
360 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 359 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
361#ifdef CONFIG_COMPAT 360#ifdef CONFIG_COMPAT
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b73e0621ce9e..b10b48c2a7af 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -910,7 +910,7 @@ static const struct file_operations eventpoll_fops = {
910void eventpoll_release_file(struct file *file) 910void eventpoll_release_file(struct file *file)
911{ 911{
912 struct eventpoll *ep; 912 struct eventpoll *ep;
913 struct epitem *epi; 913 struct epitem *epi, *next;
914 914
915 /* 915 /*
916 * We don't want to get "file->f_lock" because it is not 916 * We don't want to get "file->f_lock" because it is not
@@ -926,7 +926,7 @@ void eventpoll_release_file(struct file *file)
926 * Besides, ep_remove() acquires the lock, so we can't hold it here. 926 * Besides, ep_remove() acquires the lock, so we can't hold it here.
927 */ 927 */
928 mutex_lock(&epmutex); 928 mutex_lock(&epmutex);
929 list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { 929 list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
930 ep = epi->ep; 930 ep = epi->ep;
931 mutex_lock_nested(&ep->mtx, 0); 931 mutex_lock_nested(&ep->mtx, 0);
932 ep_remove(ep, epi); 932 ep_remove(ep, epi);
diff --git a/fs/exec.c b/fs/exec.c
index 238b7aa26f68..a3d33fe592d6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1046,13 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm);
1046 * so that a new one can be started 1046 * so that a new one can be started
1047 */ 1047 */
1048 1048
1049void set_task_comm(struct task_struct *tsk, const char *buf) 1049void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
1050{ 1050{
1051 task_lock(tsk); 1051 task_lock(tsk);
1052 trace_task_rename(tsk, buf); 1052 trace_task_rename(tsk, buf);
1053 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 1053 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1054 task_unlock(tsk); 1054 task_unlock(tsk);
1055 perf_event_comm(tsk); 1055 perf_event_comm(tsk, exec);
1056} 1056}
1057 1057
1058int flush_old_exec(struct linux_binprm * bprm) 1058int flush_old_exec(struct linux_binprm * bprm)
@@ -1110,7 +1110,8 @@ void setup_new_exec(struct linux_binprm * bprm)
1110 else 1110 else
1111 set_dumpable(current->mm, suid_dumpable); 1111 set_dumpable(current->mm, suid_dumpable);
1112 1112
1113 set_task_comm(current, kbasename(bprm->filename)); 1113 perf_event_exec();
1114 __set_task_comm(current, kbasename(bprm->filename), true);
1114 1115
1115 /* Set the new mm task size. We have to do that late because it may 1116 /* Set the new mm task size. We have to do that late because it may
1116 * depend on TIF_32BIT which is only updated in flush_thread() on 1117 * depend on TIF_32BIT which is only updated in flush_thread() on
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 491c6c078e7f..71bf8e4fb5d4 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id)
67 67
68const struct file_operations exofs_file_operations = { 68const struct file_operations exofs_file_operations = {
69 .llseek = generic_file_llseek, 69 .llseek = generic_file_llseek,
70 .read = do_sync_read, 70 .read = new_sync_read,
71 .write = do_sync_write, 71 .write = new_sync_write,
72 .aio_read = generic_file_aio_read, 72 .read_iter = generic_file_read_iter,
73 .aio_write = generic_file_aio_write, 73 .write_iter = generic_file_write_iter,
74 .mmap = generic_file_mmap, 74 .mmap = generic_file_mmap,
75 .open = generic_file_open, 75 .open = generic_file_open,
76 .release = exofs_release_file, 76 .release = exofs_release_file,
77 .fsync = exofs_file_fsync, 77 .fsync = exofs_file_fsync,
78 .flush = exofs_flush, 78 .flush = exofs_flush,
79 .splice_read = generic_file_splice_read, 79 .splice_read = generic_file_splice_read,
80 .splice_write = generic_file_splice_write, 80 .splice_write = iter_file_splice_write,
81}; 81};
82 82
83const struct inode_operations exofs_file_inode_operations = { 83const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1c244d67667..3f9cafd73931 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -964,7 +964,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
964 964
965 /* TODO: Should be easy enough to do proprly */ 965 /* TODO: Should be easy enough to do proprly */
966static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, 966static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
967 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 967 struct iov_iter *iter, loff_t offset)
968{ 968{
969 return 0; 969 return 0;
970} 970}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 44c36e590765..7c87b22a7228 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -62,10 +62,10 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
62 */ 62 */
63const struct file_operations ext2_file_operations = { 63const struct file_operations ext2_file_operations = {
64 .llseek = generic_file_llseek, 64 .llseek = generic_file_llseek,
65 .read = do_sync_read, 65 .read = new_sync_read,
66 .write = do_sync_write, 66 .write = new_sync_write,
67 .aio_read = generic_file_aio_read, 67 .read_iter = generic_file_read_iter,
68 .aio_write = generic_file_aio_write, 68 .write_iter = generic_file_write_iter,
69 .unlocked_ioctl = ext2_ioctl, 69 .unlocked_ioctl = ext2_ioctl,
70#ifdef CONFIG_COMPAT 70#ifdef CONFIG_COMPAT
71 .compat_ioctl = ext2_compat_ioctl, 71 .compat_ioctl = ext2_compat_ioctl,
@@ -75,7 +75,7 @@ const struct file_operations ext2_file_operations = {
75 .release = ext2_release_file, 75 .release = ext2_release_file,
76 .fsync = ext2_fsync, 76 .fsync = ext2_fsync,
77 .splice_read = generic_file_splice_read, 77 .splice_read = generic_file_splice_read,
78 .splice_write = generic_file_splice_write, 78 .splice_write = iter_file_splice_write,
79}; 79};
80 80
81#ifdef CONFIG_EXT2_FS_XIP 81#ifdef CONFIG_EXT2_FS_XIP
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b1d2a4675d42..36d35c36311d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -850,18 +850,18 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
850} 850}
851 851
852static ssize_t 852static ssize_t
853ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 853ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
854 loff_t offset, unsigned long nr_segs) 854 loff_t offset)
855{ 855{
856 struct file *file = iocb->ki_filp; 856 struct file *file = iocb->ki_filp;
857 struct address_space *mapping = file->f_mapping; 857 struct address_space *mapping = file->f_mapping;
858 struct inode *inode = mapping->host; 858 struct inode *inode = mapping->host;
859 size_t count = iov_iter_count(iter);
859 ssize_t ret; 860 ssize_t ret;
860 861
861 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 862 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
862 ext2_get_block);
863 if (ret < 0 && (rw & WRITE)) 863 if (ret < 0 && (rw & WRITE))
864 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); 864 ext2_write_failed(mapping, offset + count);
865 return ret; 865 return ret;
866} 866}
867 867
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index aad05311392a..a062fa1e1b11 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -50,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
50 50
51const struct file_operations ext3_file_operations = { 51const struct file_operations ext3_file_operations = {
52 .llseek = generic_file_llseek, 52 .llseek = generic_file_llseek,
53 .read = do_sync_read, 53 .read = new_sync_read,
54 .write = do_sync_write, 54 .write = new_sync_write,
55 .aio_read = generic_file_aio_read, 55 .read_iter = generic_file_read_iter,
56 .aio_write = generic_file_aio_write, 56 .write_iter = generic_file_write_iter,
57 .unlocked_ioctl = ext3_ioctl, 57 .unlocked_ioctl = ext3_ioctl,
58#ifdef CONFIG_COMPAT 58#ifdef CONFIG_COMPAT
59 .compat_ioctl = ext3_compat_ioctl, 59 .compat_ioctl = ext3_compat_ioctl,
@@ -63,7 +63,7 @@ const struct file_operations ext3_file_operations = {
63 .release = ext3_release_file, 63 .release = ext3_release_file,
64 .fsync = ext3_sync_file, 64 .fsync = ext3_sync_file,
65 .splice_read = generic_file_splice_read, 65 .splice_read = generic_file_splice_read,
66 .splice_write = generic_file_splice_write, 66 .splice_write = iter_file_splice_write,
67}; 67};
68 68
69const struct inode_operations ext3_file_inode_operations = { 69const struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 695abe738a24..2c6ccc49ba27 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1821,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
1821 * VFS code falls back into buffered path in that case so we are safe. 1821 * VFS code falls back into buffered path in that case so we are safe.
1822 */ 1822 */
1823static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, 1823static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1824 const struct iovec *iov, loff_t offset, 1824 struct iov_iter *iter, loff_t offset)
1825 unsigned long nr_segs)
1826{ 1825{
1827 struct file *file = iocb->ki_filp; 1826 struct file *file = iocb->ki_filp;
1828 struct inode *inode = file->f_mapping->host; 1827 struct inode *inode = file->f_mapping->host;
@@ -1830,10 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1830 handle_t *handle; 1829 handle_t *handle;
1831 ssize_t ret; 1830 ssize_t ret;
1832 int orphan = 0; 1831 int orphan = 0;
1833 size_t count = iov_length(iov, nr_segs); 1832 size_t count = iov_iter_count(iter);
1834 int retries = 0; 1833 int retries = 0;
1835 1834
1836 trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 1835 trace_ext3_direct_IO_enter(inode, offset, count, rw);
1837 1836
1838 if (rw == WRITE) { 1837 if (rw == WRITE) {
1839 loff_t final_size = offset + count; 1838 loff_t final_size = offset + count;
@@ -1857,15 +1856,14 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1857 } 1856 }
1858 1857
1859retry: 1858retry:
1860 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 1859 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
1861 ext3_get_block);
1862 /* 1860 /*
1863 * In case of error extending write may have instantiated a few 1861 * In case of error extending write may have instantiated a few
1864 * blocks outside i_size. Trim these off again. 1862 * blocks outside i_size. Trim these off again.
1865 */ 1863 */
1866 if (unlikely((rw & WRITE) && ret < 0)) { 1864 if (unlikely((rw & WRITE) && ret < 0)) {
1867 loff_t isize = i_size_read(inode); 1865 loff_t isize = i_size_read(inode);
1868 loff_t end = offset + iov_length(iov, nr_segs); 1866 loff_t end = offset + count;
1869 1867
1870 if (end > isize) 1868 if (end > isize)
1871 ext3_truncate_failed_direct_write(inode); 1869 ext3_truncate_failed_direct_write(inode);
@@ -1910,8 +1908,7 @@ retry:
1910 ret = err; 1908 ret = err;
1911 } 1909 }
1912out: 1910out:
1913 trace_ext3_direct_IO_exit(inode, offset, 1911 trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
1914 iov_length(iov, nr_segs), rw, ret);
1915 return ret; 1912 return ret;
1916} 1913}
1917 1914
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0762d143e252..fca382037ddd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -194,7 +194,16 @@ static void ext4_init_block_bitmap(struct super_block *sb,
194 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { 194 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
195 ext4_error(sb, "Checksum bad for group %u", block_group); 195 ext4_error(sb, "Checksum bad for group %u", block_group);
196 grp = ext4_get_group_info(sb, block_group); 196 grp = ext4_get_group_info(sb, block_group);
197 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
198 percpu_counter_sub(&sbi->s_freeclusters_counter,
199 grp->bb_free);
197 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); 200 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
201 if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
202 int count;
203 count = ext4_free_inodes_count(sb, gdp);
204 percpu_counter_sub(&sbi->s_freeinodes_counter,
205 count);
206 }
198 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); 207 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
199 return; 208 return;
200 } 209 }
@@ -359,6 +368,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
359{ 368{
360 ext4_fsblk_t blk; 369 ext4_fsblk_t blk;
361 struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); 370 struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
371 struct ext4_sb_info *sbi = EXT4_SB(sb);
362 372
363 if (buffer_verified(bh)) 373 if (buffer_verified(bh))
364 return; 374 return;
@@ -369,6 +379,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
369 ext4_unlock_group(sb, block_group); 379 ext4_unlock_group(sb, block_group);
370 ext4_error(sb, "bg %u: block %llu: invalid block bitmap", 380 ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
371 block_group, blk); 381 block_group, blk);
382 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
383 percpu_counter_sub(&sbi->s_freeclusters_counter,
384 grp->bb_free);
372 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); 385 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
373 return; 386 return;
374 } 387 }
@@ -376,6 +389,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
376 desc, bh))) { 389 desc, bh))) {
377 ext4_unlock_group(sb, block_group); 390 ext4_unlock_group(sb, block_group);
378 ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); 391 ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
392 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
393 percpu_counter_sub(&sbi->s_freeclusters_counter,
394 grp->bb_free);
379 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); 395 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
380 return; 396 return;
381 } 397 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1479e2ae00d2..7cc5a0e23688 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2140,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
2140extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2140extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
2141 struct ext4_map_blocks *map, int flags); 2141 struct ext4_map_blocks *map, int flags);
2142extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 2142extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
2143 const struct iovec *iov, loff_t offset, 2143 struct iov_iter *iter, loff_t offset);
2144 unsigned long nr_segs);
2145extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2144extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2146extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2145extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2147extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2146extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4e8bc284ec0e..8695f70af1ef 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,26 +74,22 @@ static void ext4_unwritten_wait(struct inode *inode)
74 * or one thread will zero the other's data, causing corruption. 74 * or one thread will zero the other's data, causing corruption.
75 */ 75 */
76static int 76static int
77ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, 77ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
78 unsigned long nr_segs, loff_t pos)
79{ 78{
80 struct super_block *sb = inode->i_sb; 79 struct super_block *sb = inode->i_sb;
81 int blockmask = sb->s_blocksize - 1; 80 int blockmask = sb->s_blocksize - 1;
82 size_t count = iov_length(iov, nr_segs);
83 loff_t final_size = pos + count;
84 81
85 if (pos >= i_size_read(inode)) 82 if (pos >= i_size_read(inode))
86 return 0; 83 return 0;
87 84
88 if ((pos & blockmask) || (final_size & blockmask)) 85 if ((pos | iov_iter_alignment(from)) & blockmask)
89 return 1; 86 return 1;
90 87
91 return 0; 88 return 0;
92} 89}
93 90
94static ssize_t 91static ssize_t
95ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 92ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
96 unsigned long nr_segs, loff_t pos)
97{ 93{
98 struct file *file = iocb->ki_filp; 94 struct file *file = iocb->ki_filp;
99 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
@@ -101,10 +97,9 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
101 struct blk_plug plug; 97 struct blk_plug plug;
102 int o_direct = file->f_flags & O_DIRECT; 98 int o_direct = file->f_flags & O_DIRECT;
103 int overwrite = 0; 99 int overwrite = 0;
104 size_t length = iov_length(iov, nr_segs); 100 size_t length = iov_iter_count(from);
105 ssize_t ret; 101 ssize_t ret;
106 102 loff_t pos = iocb->ki_pos;
107 BUG_ON(iocb->ki_pos != pos);
108 103
109 /* 104 /*
110 * Unaligned direct AIO must be serialized; see comment above 105 * Unaligned direct AIO must be serialized; see comment above
@@ -114,7 +109,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
114 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && 109 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
115 !is_sync_kiocb(iocb) && 110 !is_sync_kiocb(iocb) &&
116 (file->f_flags & O_APPEND || 111 (file->f_flags & O_APPEND ||
117 ext4_unaligned_aio(inode, iov, nr_segs, pos))) { 112 ext4_unaligned_aio(inode, from, pos))) {
118 aio_mutex = ext4_aio_mutex(inode); 113 aio_mutex = ext4_aio_mutex(inode);
119 mutex_lock(aio_mutex); 114 mutex_lock(aio_mutex);
120 ext4_unwritten_wait(inode); 115 ext4_unwritten_wait(inode);
@@ -138,10 +133,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
138 goto errout; 133 goto errout;
139 } 134 }
140 135
141 if (pos + length > sbi->s_bitmap_maxbytes) { 136 if (pos + length > sbi->s_bitmap_maxbytes)
142 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 137 iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
143 sbi->s_bitmap_maxbytes - pos);
144 }
145 } 138 }
146 139
147 if (o_direct) { 140 if (o_direct) {
@@ -179,7 +172,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
179 } 172 }
180 } 173 }
181 174
182 ret = __generic_file_aio_write(iocb, iov, nr_segs); 175 ret = __generic_file_write_iter(iocb, from);
183 mutex_unlock(&inode->i_mutex); 176 mutex_unlock(&inode->i_mutex);
184 177
185 if (ret > 0) { 178 if (ret > 0) {
@@ -594,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
594 587
595const struct file_operations ext4_file_operations = { 588const struct file_operations ext4_file_operations = {
596 .llseek = ext4_llseek, 589 .llseek = ext4_llseek,
597 .read = do_sync_read, 590 .read = new_sync_read,
598 .write = do_sync_write, 591 .write = new_sync_write,
599 .aio_read = generic_file_aio_read, 592 .read_iter = generic_file_read_iter,
600 .aio_write = ext4_file_write, 593 .write_iter = ext4_file_write_iter,
601 .unlocked_ioctl = ext4_ioctl, 594 .unlocked_ioctl = ext4_ioctl,
602#ifdef CONFIG_COMPAT 595#ifdef CONFIG_COMPAT
603 .compat_ioctl = ext4_compat_ioctl, 596 .compat_ioctl = ext4_compat_ioctl,
@@ -607,7 +600,7 @@ const struct file_operations ext4_file_operations = {
607 .release = ext4_release_file, 600 .release = ext4_release_file,
608 .fsync = ext4_sync_file, 601 .fsync = ext4_sync_file,
609 .splice_read = generic_file_splice_read, 602 .splice_read = generic_file_splice_read,
610 .splice_write = generic_file_splice_write, 603 .splice_write = iter_file_splice_write,
611 .fallocate = ext4_fallocate, 604 .fallocate = ext4_fallocate,
612}; 605};
613 606
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 0ee59a6644e2..a87455df38bc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
71 struct ext4_group_desc *gdp) 71 struct ext4_group_desc *gdp)
72{ 72{
73 struct ext4_group_info *grp; 73 struct ext4_group_info *grp;
74 struct ext4_sb_info *sbi = EXT4_SB(sb);
74 J_ASSERT_BH(bh, buffer_locked(bh)); 75 J_ASSERT_BH(bh, buffer_locked(bh));
75 76
76 /* If checksum is bad mark all blocks and inodes use to prevent 77 /* If checksum is bad mark all blocks and inodes use to prevent
@@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
78 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { 79 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
79 ext4_error(sb, "Checksum bad for group %u", block_group); 80 ext4_error(sb, "Checksum bad for group %u", block_group);
80 grp = ext4_get_group_info(sb, block_group); 81 grp = ext4_get_group_info(sb, block_group);
82 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
83 percpu_counter_sub(&sbi->s_freeclusters_counter,
84 grp->bb_free);
81 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); 85 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
86 if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
87 int count;
88 count = ext4_free_inodes_count(sb, gdp);
89 percpu_counter_sub(&sbi->s_freeinodes_counter,
90 count);
91 }
82 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); 92 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
83 return 0; 93 return 0;
84 } 94 }
@@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
116 struct buffer_head *bh = NULL; 126 struct buffer_head *bh = NULL;
117 ext4_fsblk_t bitmap_blk; 127 ext4_fsblk_t bitmap_blk;
118 struct ext4_group_info *grp; 128 struct ext4_group_info *grp;
129 struct ext4_sb_info *sbi = EXT4_SB(sb);
119 130
120 desc = ext4_get_group_desc(sb, block_group, NULL); 131 desc = ext4_get_group_desc(sb, block_group, NULL);
121 if (!desc) 132 if (!desc)
@@ -185,6 +196,12 @@ verify:
185 ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " 196 ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
186 "inode_bitmap = %llu", block_group, bitmap_blk); 197 "inode_bitmap = %llu", block_group, bitmap_blk);
187 grp = ext4_get_group_info(sb, block_group); 198 grp = ext4_get_group_info(sb, block_group);
199 if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
200 int count;
201 count = ext4_free_inodes_count(sb, desc);
202 percpu_counter_sub(&sbi->s_freeinodes_counter,
203 count);
204 }
188 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); 205 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
189 return NULL; 206 return NULL;
190 } 207 }
@@ -321,6 +338,12 @@ out:
321 fatal = err; 338 fatal = err;
322 } else { 339 } else {
323 ext4_error(sb, "bit already cleared for inode %lu", ino); 340 ext4_error(sb, "bit already cleared for inode %lu", ino);
341 if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
342 int count;
343 count = ext4_free_inodes_count(sb, gdp);
344 percpu_counter_sub(&sbi->s_freeinodes_counter,
345 count);
346 }
324 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); 347 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
325 } 348 }
326 349
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 594009f5f523..fd69da194826 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
389 return 0; 389 return 0;
390failed: 390failed:
391 for (; i >= 0; i--) { 391 for (; i >= 0; i--) {
392 if (i != indirect_blks && branch[i].bh) 392 /*
393 * We want to ext4_forget() only freshly allocated indirect
394 * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
395 * buffer at branch[0].bh is indirect block / inode already
396 * existing before ext4_alloc_branch() was called.
397 */
398 if (i > 0 && i != indirect_blks && branch[i].bh)
393 ext4_forget(handle, 1, inode, branch[i].bh, 399 ext4_forget(handle, 1, inode, branch[i].bh,
394 branch[i].bh->b_blocknr); 400 branch[i].bh->b_blocknr);
395 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 401 ext4_free_blocks(handle, inode, NULL, new_blocks[i],
@@ -639,8 +645,7 @@ out:
639 * VFS code falls back into buffered path in that case so we are safe. 645 * VFS code falls back into buffered path in that case so we are safe.
640 */ 646 */
641ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, 647ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
642 const struct iovec *iov, loff_t offset, 648 struct iov_iter *iter, loff_t offset)
643 unsigned long nr_segs)
644{ 649{
645 struct file *file = iocb->ki_filp; 650 struct file *file = iocb->ki_filp;
646 struct inode *inode = file->f_mapping->host; 651 struct inode *inode = file->f_mapping->host;
@@ -648,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
648 handle_t *handle; 653 handle_t *handle;
649 ssize_t ret; 654 ssize_t ret;
650 int orphan = 0; 655 int orphan = 0;
651 size_t count = iov_length(iov, nr_segs); 656 size_t count = iov_iter_count(iter);
652 int retries = 0; 657 int retries = 0;
653 658
654 if (rw == WRITE) { 659 if (rw == WRITE) {
@@ -687,18 +692,17 @@ retry:
687 goto locked; 692 goto locked;
688 } 693 }
689 ret = __blockdev_direct_IO(rw, iocb, inode, 694 ret = __blockdev_direct_IO(rw, iocb, inode,
690 inode->i_sb->s_bdev, iov, 695 inode->i_sb->s_bdev, iter, offset,
691 offset, nr_segs,
692 ext4_get_block, NULL, NULL, 0); 696 ext4_get_block, NULL, NULL, 0);
693 inode_dio_done(inode); 697 inode_dio_done(inode);
694 } else { 698 } else {
695locked: 699locked:
696 ret = blockdev_direct_IO(rw, iocb, inode, iov, 700 ret = blockdev_direct_IO(rw, iocb, inode, iter,
697 offset, nr_segs, ext4_get_block); 701 offset, ext4_get_block);
698 702
699 if (unlikely((rw & WRITE) && ret < 0)) { 703 if (unlikely((rw & WRITE) && ret < 0)) {
700 loff_t isize = i_size_read(inode); 704 loff_t isize = i_size_read(inode);
701 loff_t end = offset + iov_length(iov, nr_segs); 705 loff_t end = offset + count;
702 706
703 if (end > isize) 707 if (end > isize)
704 ext4_truncate_failed_write(inode); 708 ext4_truncate_failed_write(inode);
@@ -1312,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
1312 blk = *i_data; 1316 blk = *i_data;
1313 if (level > 0) { 1317 if (level > 0) {
1314 ext4_lblk_t first2; 1318 ext4_lblk_t first2;
1319 ext4_lblk_t count2;
1320
1315 bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); 1321 bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
1316 if (!bh) { 1322 if (!bh) {
1317 EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), 1323 EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
1318 "Read failure"); 1324 "Read failure");
1319 return -EIO; 1325 return -EIO;
1320 } 1326 }
1321 first2 = (first > offset) ? first - offset : 0; 1327 if (first > offset) {
1328 first2 = first - offset;
1329 count2 = count;
1330 } else {
1331 first2 = 0;
1332 count2 = count - (offset - first);
1333 }
1322 ret = free_hole_blocks(handle, inode, bh, 1334 ret = free_hole_blocks(handle, inode, bh,
1323 (__le32 *)bh->b_data, level - 1, 1335 (__le32 *)bh->b_data, level - 1,
1324 first2, count - offset, 1336 first2, count2,
1325 inode->i_sb->s_blocksize >> 2); 1337 inode->i_sb->s_blocksize >> 2);
1326 if (ret) { 1338 if (ret) {
1327 brelse(bh); 1339 brelse(bh);
@@ -1331,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
1331 if (level == 0 || 1343 if (level == 0 ||
1332 (bh && all_zeroes((__le32 *)bh->b_data, 1344 (bh && all_zeroes((__le32 *)bh->b_data,
1333 (__le32 *)bh->b_data + addr_per_block))) { 1345 (__le32 *)bh->b_data + addr_per_block))) {
1334 ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); 1346 ext4_free_data(handle, inode, parent_bh,
1335 *i_data = 0; 1347 i_data, i_data + 1);
1336 } 1348 }
1337 brelse(bh); 1349 brelse(bh);
1338 bh = NULL; 1350 bh = NULL;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7fcd68ee9155..8a064734e6eb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3093,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3093 * 3093 *
3094 */ 3094 */
3095static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, 3095static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3096 const struct iovec *iov, loff_t offset, 3096 struct iov_iter *iter, loff_t offset)
3097 unsigned long nr_segs)
3098{ 3097{
3099 struct file *file = iocb->ki_filp; 3098 struct file *file = iocb->ki_filp;
3100 struct inode *inode = file->f_mapping->host; 3099 struct inode *inode = file->f_mapping->host;
3101 ssize_t ret; 3100 ssize_t ret;
3102 size_t count = iov_length(iov, nr_segs); 3101 size_t count = iov_iter_count(iter);
3103 int overwrite = 0; 3102 int overwrite = 0;
3104 get_block_t *get_block_func = NULL; 3103 get_block_t *get_block_func = NULL;
3105 int dio_flags = 0; 3104 int dio_flags = 0;
@@ -3108,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3108 3107
3109 /* Use the old path for reads and writes beyond i_size. */ 3108 /* Use the old path for reads and writes beyond i_size. */
3110 if (rw != WRITE || final_size > inode->i_size) 3109 if (rw != WRITE || final_size > inode->i_size)
3111 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3110 return ext4_ind_direct_IO(rw, iocb, iter, offset);
3112 3111
3113 BUG_ON(iocb->private == NULL); 3112 BUG_ON(iocb->private == NULL);
3114 3113
@@ -3175,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3175 dio_flags = DIO_LOCKING; 3174 dio_flags = DIO_LOCKING;
3176 } 3175 }
3177 ret = __blockdev_direct_IO(rw, iocb, inode, 3176 ret = __blockdev_direct_IO(rw, iocb, inode,
3178 inode->i_sb->s_bdev, iov, 3177 inode->i_sb->s_bdev, iter,
3179 offset, nr_segs, 3178 offset,
3180 get_block_func, 3179 get_block_func,
3181 ext4_end_io_dio, 3180 ext4_end_io_dio,
3182 NULL, 3181 NULL,
@@ -3230,11 +3229,11 @@ retake_lock:
3230} 3229}
3231 3230
3232static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3231static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3233 const struct iovec *iov, loff_t offset, 3232 struct iov_iter *iter, loff_t offset)
3234 unsigned long nr_segs)
3235{ 3233{
3236 struct file *file = iocb->ki_filp; 3234 struct file *file = iocb->ki_filp;
3237 struct inode *inode = file->f_mapping->host; 3235 struct inode *inode = file->f_mapping->host;
3236 size_t count = iov_iter_count(iter);
3238 ssize_t ret; 3237 ssize_t ret;
3239 3238
3240 /* 3239 /*
@@ -3247,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3247 if (ext4_has_inline_data(inode)) 3246 if (ext4_has_inline_data(inode))
3248 return 0; 3247 return 0;
3249 3248
3250 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3249 trace_ext4_direct_IO_enter(inode, offset, count, rw);
3251 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3250 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3252 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3251 ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
3253 else 3252 else
3254 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3253 ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
3255 trace_ext4_direct_IO_exit(inode, offset, 3254 trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
3256 iov_length(iov, nr_segs), rw, ret);
3257 return ret; 3255 return ret;
3258} 3256}
3259 3257
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 59e31622cc6e..7f72f50a8fa7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
722 void *buddy, void *bitmap, ext4_group_t group) 722 void *buddy, void *bitmap, ext4_group_t group)
723{ 723{
724 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 724 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
725 struct ext4_sb_info *sbi = EXT4_SB(sb);
725 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 726 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
726 ext4_grpblk_t i = 0; 727 ext4_grpblk_t i = 0;
727 ext4_grpblk_t first; 728 ext4_grpblk_t first;
@@ -759,6 +760,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
759 * corrupt and update bb_free using bitmap value 760 * corrupt and update bb_free using bitmap value
760 */ 761 */
761 grp->bb_free = free; 762 grp->bb_free = free;
763 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
764 percpu_counter_sub(&sbi->s_freeclusters_counter,
765 grp->bb_free);
762 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); 766 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
763 } 767 }
764 mb_set_largest_free_order(sb, grp); 768 mb_set_largest_free_order(sb, grp);
@@ -1431,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1431 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 1435 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1432 1436
1433 if (unlikely(block != -1)) { 1437 if (unlikely(block != -1)) {
1438 struct ext4_sb_info *sbi = EXT4_SB(sb);
1434 ext4_fsblk_t blocknr; 1439 ext4_fsblk_t blocknr;
1435 1440
1436 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1441 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1441,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1441 "freeing already freed block " 1446 "freeing already freed block "
1442 "(bit %u); block bitmap corrupt.", 1447 "(bit %u); block bitmap corrupt.",
1443 block); 1448 block);
1449 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1450 percpu_counter_sub(&sbi->s_freeclusters_counter,
1451 e4b->bd_info->bb_free);
1444 /* Mark the block group as corrupt. */ 1452 /* Mark the block group as corrupt. */
1445 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 1453 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1446 &e4b->bd_info->bb_state); 1454 &e4b->bd_info->bb_state);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c1fb6dd10911..0924521306b4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1017,10 +1017,9 @@ static int f2fs_write_end(struct file *file,
1017} 1017}
1018 1018
1019static int check_direct_IO(struct inode *inode, int rw, 1019static int check_direct_IO(struct inode *inode, int rw,
1020 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 1020 struct iov_iter *iter, loff_t offset)
1021{ 1021{
1022 unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; 1022 unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
1023 int i;
1024 1023
1025 if (rw == READ) 1024 if (rw == READ)
1026 return 0; 1025 return 0;
@@ -1028,14 +1027,14 @@ static int check_direct_IO(struct inode *inode, int rw,
1028 if (offset & blocksize_mask) 1027 if (offset & blocksize_mask)
1029 return -EINVAL; 1028 return -EINVAL;
1030 1029
1031 for (i = 0; i < nr_segs; i++) 1030 if (iov_iter_alignment(iter) & blocksize_mask)
1032 if (iov[i].iov_len & blocksize_mask) 1031 return -EINVAL;
1033 return -EINVAL; 1032
1034 return 0; 1033 return 0;
1035} 1034}
1036 1035
1037static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 1036static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1038 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 1037 struct iov_iter *iter, loff_t offset)
1039{ 1038{
1040 struct file *file = iocb->ki_filp; 1039 struct file *file = iocb->ki_filp;
1041 struct inode *inode = file->f_mapping->host; 1040 struct inode *inode = file->f_mapping->host;
@@ -1044,14 +1043,14 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1044 if (f2fs_has_inline_data(inode)) 1043 if (f2fs_has_inline_data(inode))
1045 return 0; 1044 return 0;
1046 1045
1047 if (check_direct_IO(inode, rw, iov, offset, nr_segs)) 1046 if (check_direct_IO(inode, rw, iter, offset))
1048 return 0; 1047 return 0;
1049 1048
1050 /* clear fsync mark to recover these blocks */ 1049 /* clear fsync mark to recover these blocks */
1051 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); 1050 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
1052 1051
1053 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 1052 return blockdev_direct_IO(rw, iocb, inode, iter, offset,
1054 get_data_block); 1053 get_data_block);
1055} 1054}
1056 1055
1057static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1056static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9c49c593d8eb..c58e33075719 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -808,10 +808,10 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
808 808
809const struct file_operations f2fs_file_operations = { 809const struct file_operations f2fs_file_operations = {
810 .llseek = f2fs_llseek, 810 .llseek = f2fs_llseek,
811 .read = do_sync_read, 811 .read = new_sync_read,
812 .write = do_sync_write, 812 .write = new_sync_write,
813 .aio_read = generic_file_aio_read, 813 .read_iter = generic_file_read_iter,
814 .aio_write = generic_file_aio_write, 814 .write_iter = generic_file_write_iter,
815 .open = generic_file_open, 815 .open = generic_file_open,
816 .mmap = f2fs_file_mmap, 816 .mmap = f2fs_file_mmap,
817 .fsync = f2fs_sync_file, 817 .fsync = f2fs_sync_file,
@@ -821,5 +821,5 @@ const struct file_operations f2fs_file_operations = {
821 .compat_ioctl = f2fs_compat_ioctl, 821 .compat_ioctl = f2fs_compat_ioctl,
822#endif 822#endif
823 .splice_read = generic_file_splice_read, 823 .splice_read = generic_file_splice_read,
824 .splice_write = generic_file_splice_write, 824 .splice_write = iter_file_splice_write,
825}; 825};
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 9b104f543056..85f79a89e747 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -170,10 +170,10 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
170 170
171const struct file_operations fat_file_operations = { 171const struct file_operations fat_file_operations = {
172 .llseek = generic_file_llseek, 172 .llseek = generic_file_llseek,
173 .read = do_sync_read, 173 .read = new_sync_read,
174 .write = do_sync_write, 174 .write = new_sync_write,
175 .aio_read = generic_file_aio_read, 175 .read_iter = generic_file_read_iter,
176 .aio_write = generic_file_aio_write, 176 .write_iter = generic_file_write_iter,
177 .mmap = generic_file_mmap, 177 .mmap = generic_file_mmap,
178 .release = fat_file_release, 178 .release = fat_file_release,
179 .unlocked_ioctl = fat_generic_ioctl, 179 .unlocked_ioctl = fat_generic_ioctl,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9c83594d7fb5..756aead10d96 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -247,12 +247,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
247} 247}
248 248
249static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, 249static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
250 const struct iovec *iov, 250 struct iov_iter *iter,
251 loff_t offset, unsigned long nr_segs) 251 loff_t offset)
252{ 252{
253 struct file *file = iocb->ki_filp; 253 struct file *file = iocb->ki_filp;
254 struct address_space *mapping = file->f_mapping; 254 struct address_space *mapping = file->f_mapping;
255 struct inode *inode = mapping->host; 255 struct inode *inode = mapping->host;
256 size_t count = iov_iter_count(iter);
256 ssize_t ret; 257 ssize_t ret;
257 258
258 if (rw == WRITE) { 259 if (rw == WRITE) {
@@ -265,7 +266,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
265 * 266 *
266 * Return 0, and fallback to normal buffered write. 267 * Return 0, and fallback to normal buffered write.
267 */ 268 */
268 loff_t size = offset + iov_length(iov, nr_segs); 269 loff_t size = offset + count;
269 if (MSDOS_I(inode)->mmu_private < size) 270 if (MSDOS_I(inode)->mmu_private < size)
270 return 0; 271 return 0;
271 } 272 }
@@ -274,10 +275,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
274 * FAT need to use the DIO_LOCKING for avoiding the race 275 * FAT need to use the DIO_LOCKING for avoiding the race
275 * condition of fat_get_block() and ->truncate(). 276 * condition of fat_get_block() and ->truncate().
276 */ 277 */
277 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 278 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block);
278 fat_get_block);
279 if (ret < 0 && (rw & WRITE)) 279 if (ret < 0 && (rw & WRITE))
280 fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); 280 fat_write_failed(mapping, offset + count);
281 281
282 return ret; 282 return ret;
283} 283}
diff --git a/fs/file.c b/fs/file.c
index 8f294cfac697..66923fe3176e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -44,15 +44,10 @@ static void *alloc_fdmem(size_t size)
44 return vmalloc(size); 44 return vmalloc(size);
45} 45}
46 46
47static void free_fdmem(void *ptr)
48{
49 is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
50}
51
52static void __free_fdtable(struct fdtable *fdt) 47static void __free_fdtable(struct fdtable *fdt)
53{ 48{
54 free_fdmem(fdt->fd); 49 kvfree(fdt->fd);
55 free_fdmem(fdt->open_fds); 50 kvfree(fdt->open_fds);
56 kfree(fdt); 51 kfree(fdt);
57} 52}
58 53
@@ -130,7 +125,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
130 return fdt; 125 return fdt;
131 126
132out_arr: 127out_arr:
133 free_fdmem(fdt->fd); 128 kvfree(fdt->fd);
134out_fdt: 129out_fdt:
135 kfree(fdt); 130 kfree(fdt);
136out: 131out:
diff --git a/fs/file_table.c b/fs/file_table.c
index 40bf4660f0a3..385bfd31512a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -175,6 +175,12 @@ struct file *alloc_file(struct path *path, fmode_t mode,
175 file->f_path = *path; 175 file->f_path = *path;
176 file->f_inode = path->dentry->d_inode; 176 file->f_inode = path->dentry->d_inode;
177 file->f_mapping = path->dentry->d_inode->i_mapping; 177 file->f_mapping = path->dentry->d_inode->i_mapping;
178 if ((mode & FMODE_READ) &&
179 likely(fop->read || fop->aio_read || fop->read_iter))
180 mode |= FMODE_CAN_READ;
181 if ((mode & FMODE_WRITE) &&
182 likely(fop->write || fop->aio_write || fop->write_iter))
183 mode |= FMODE_CAN_WRITE;
178 file->f_mode = mode; 184 file->f_mode = mode;
179 file->f_op = fop; 185 file->f_op = fop;
180 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) 186 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 13b691a8a7d2..966ace8b243f 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -94,8 +94,10 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
94 loff_t pos = 0; 94 loff_t pos = 0;
95 struct iovec iov = { .iov_base = buf, .iov_len = count }; 95 struct iovec iov = { .iov_base = buf, .iov_len = count };
96 struct fuse_io_priv io = { .async = 0, .file = file }; 96 struct fuse_io_priv io = { .async = 0, .file = file };
97 struct iov_iter ii;
98 iov_iter_init(&ii, READ, &iov, 1, count);
97 99
98 return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE); 100 return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE);
99} 101}
100 102
101static ssize_t cuse_write(struct file *file, const char __user *buf, 103static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -104,12 +106,14 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
104 loff_t pos = 0; 106 loff_t pos = 0;
105 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; 107 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
106 struct fuse_io_priv io = { .async = 0, .file = file }; 108 struct fuse_io_priv io = { .async = 0, .file = file };
109 struct iov_iter ii;
110 iov_iter_init(&ii, WRITE, &iov, 1, count);
107 111
108 /* 112 /*
109 * No locking or generic_write_checks(), the server is 113 * No locking or generic_write_checks(), the server is
110 * responsible for locking and sanity checks. 114 * responsible for locking and sanity checks.
111 */ 115 */
112 return fuse_direct_io(&io, &iov, 1, count, &pos, 116 return fuse_direct_io(&io, &ii, &pos,
113 FUSE_DIO_WRITE | FUSE_DIO_CUSE); 117 FUSE_DIO_WRITE | FUSE_DIO_CUSE);
114} 118}
115 119
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 903cbc9cd6bd..6e16dad13e9b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -933,8 +933,7 @@ out:
933 return err; 933 return err;
934} 934}
935 935
936static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 936static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
937 unsigned long nr_segs, loff_t pos)
938{ 937{
939 struct inode *inode = iocb->ki_filp->f_mapping->host; 938 struct inode *inode = iocb->ki_filp->f_mapping->host;
940 struct fuse_conn *fc = get_fuse_conn(inode); 939 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -945,14 +944,14 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
945 * i_size is up to date). 944 * i_size is up to date).
946 */ 945 */
947 if (fc->auto_inval_data || 946 if (fc->auto_inval_data ||
948 (pos + iov_length(iov, nr_segs) > i_size_read(inode))) { 947 (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
949 int err; 948 int err;
950 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); 949 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
951 if (err) 950 if (err)
952 return err; 951 return err;
953 } 952 }
954 953
955 return generic_file_aio_read(iocb, iov, nr_segs, pos); 954 return generic_file_read_iter(iocb, to);
956} 955}
957 956
958static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, 957static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
@@ -1181,19 +1180,17 @@ static ssize_t fuse_perform_write(struct file *file,
1181 return res > 0 ? res : err; 1180 return res > 0 ? res : err;
1182} 1181}
1183 1182
1184static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 1183static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1185 unsigned long nr_segs, loff_t pos)
1186{ 1184{
1187 struct file *file = iocb->ki_filp; 1185 struct file *file = iocb->ki_filp;
1188 struct address_space *mapping = file->f_mapping; 1186 struct address_space *mapping = file->f_mapping;
1189 size_t count = 0; 1187 size_t count = iov_iter_count(from);
1190 size_t ocount = 0;
1191 ssize_t written = 0; 1188 ssize_t written = 0;
1192 ssize_t written_buffered = 0; 1189 ssize_t written_buffered = 0;
1193 struct inode *inode = mapping->host; 1190 struct inode *inode = mapping->host;
1194 ssize_t err; 1191 ssize_t err;
1195 struct iov_iter i;
1196 loff_t endbyte = 0; 1192 loff_t endbyte = 0;
1193 loff_t pos = iocb->ki_pos;
1197 1194
1198 if (get_fuse_conn(inode)->writeback_cache) { 1195 if (get_fuse_conn(inode)->writeback_cache) {
1199 /* Update size (EOF optimization) and mode (SUID clearing) */ 1196 /* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1201,17 +1198,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1201 if (err) 1198 if (err)
1202 return err; 1199 return err;
1203 1200
1204 return generic_file_aio_write(iocb, iov, nr_segs, pos); 1201 return generic_file_write_iter(iocb, from);
1205 } 1202 }
1206 1203
1207 WARN_ON(iocb->ki_pos != pos);
1208
1209 ocount = 0;
1210 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1211 if (err)
1212 return err;
1213
1214 count = ocount;
1215 mutex_lock(&inode->i_mutex); 1204 mutex_lock(&inode->i_mutex);
1216 1205
1217 /* We can write back this queue in page reclaim */ 1206 /* We can write back this queue in page reclaim */
@@ -1224,6 +1213,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1224 if (count == 0) 1213 if (count == 0)
1225 goto out; 1214 goto out;
1226 1215
1216 iov_iter_truncate(from, count);
1227 err = file_remove_suid(file); 1217 err = file_remove_suid(file);
1228 if (err) 1218 if (err)
1229 goto out; 1219 goto out;
@@ -1233,16 +1223,13 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1233 goto out; 1223 goto out;
1234 1224
1235 if (file->f_flags & O_DIRECT) { 1225 if (file->f_flags & O_DIRECT) {
1236 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 1226 written = generic_file_direct_write(iocb, from, pos);
1237 count, ocount); 1227 if (written < 0 || !iov_iter_count(from))
1238 if (written < 0 || written == count)
1239 goto out; 1228 goto out;
1240 1229
1241 pos += written; 1230 pos += written;
1242 count -= written;
1243 1231
1244 iov_iter_init(&i, iov, nr_segs, count, written); 1232 written_buffered = fuse_perform_write(file, mapping, from, pos);
1245 written_buffered = fuse_perform_write(file, mapping, &i, pos);
1246 if (written_buffered < 0) { 1233 if (written_buffered < 0) {
1247 err = written_buffered; 1234 err = written_buffered;
1248 goto out; 1235 goto out;
@@ -1261,8 +1248,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1261 written += written_buffered; 1248 written += written_buffered;
1262 iocb->ki_pos = pos + written_buffered; 1249 iocb->ki_pos = pos + written_buffered;
1263 } else { 1250 } else {
1264 iov_iter_init(&i, iov, nr_segs, count, 0); 1251 written = fuse_perform_write(file, mapping, from, pos);
1265 written = fuse_perform_write(file, mapping, &i, pos);
1266 if (written >= 0) 1252 if (written >= 0)
1267 iocb->ki_pos = pos + written; 1253 iocb->ki_pos = pos + written;
1268 } 1254 }
@@ -1300,7 +1286,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1300 size_t nbytes = 0; /* # bytes already packed in req */ 1286 size_t nbytes = 0; /* # bytes already packed in req */
1301 1287
1302 /* Special case for kernel I/O: can copy directly into the buffer */ 1288 /* Special case for kernel I/O: can copy directly into the buffer */
1303 if (segment_eq(get_fs(), KERNEL_DS)) { 1289 if (ii->type & ITER_KVEC) {
1304 unsigned long user_addr = fuse_get_user_addr(ii); 1290 unsigned long user_addr = fuse_get_user_addr(ii);
1305 size_t frag_size = fuse_get_frag_size(ii, *nbytesp); 1291 size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
1306 1292
@@ -1316,35 +1302,26 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1316 1302
1317 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1318 unsigned npages; 1304 unsigned npages;
1319 unsigned long user_addr = fuse_get_user_addr(ii); 1305 size_t start;
1320 unsigned offset = user_addr & ~PAGE_MASK;
1321 size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
1322 int ret;
1323
1324 unsigned n = req->max_pages - req->num_pages; 1306 unsigned n = req->max_pages - req->num_pages;
1325 frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); 1307 ssize_t ret = iov_iter_get_pages(ii,
1326 1308 &req->pages[req->num_pages],
1327 npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1309 n * PAGE_SIZE, &start);
1328 npages = clamp(npages, 1U, n);
1329
1330 ret = get_user_pages_fast(user_addr, npages, !write,
1331 &req->pages[req->num_pages]);
1332 if (ret < 0) 1310 if (ret < 0)
1333 return ret; 1311 return ret;
1334 1312
1335 npages = ret; 1313 iov_iter_advance(ii, ret);
1336 frag_size = min_t(size_t, frag_size, 1314 nbytes += ret;
1337 (npages << PAGE_SHIFT) - offset); 1315
1338 iov_iter_advance(ii, frag_size); 1316 ret += start;
1317 npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
1339 1318
1340 req->page_descs[req->num_pages].offset = offset; 1319 req->page_descs[req->num_pages].offset = start;
1341 fuse_page_descs_length_init(req, req->num_pages, npages); 1320 fuse_page_descs_length_init(req, req->num_pages, npages);
1342 1321
1343 req->num_pages += npages; 1322 req->num_pages += npages;
1344 req->page_descs[req->num_pages - 1].length -= 1323 req->page_descs[req->num_pages - 1].length -=
1345 (npages << PAGE_SHIFT) - offset - frag_size; 1324 (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
1346
1347 nbytes += frag_size;
1348 } 1325 }
1349 1326
1350 if (write) 1327 if (write)
@@ -1359,24 +1336,11 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1359 1336
1360static inline int fuse_iter_npages(const struct iov_iter *ii_p) 1337static inline int fuse_iter_npages(const struct iov_iter *ii_p)
1361{ 1338{
1362 struct iov_iter ii = *ii_p; 1339 return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
1363 int npages = 0;
1364
1365 while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
1366 unsigned long user_addr = fuse_get_user_addr(&ii);
1367 unsigned offset = user_addr & ~PAGE_MASK;
1368 size_t frag_size = iov_iter_single_seg_count(&ii);
1369
1370 npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
1371 iov_iter_advance(&ii, frag_size);
1372 }
1373
1374 return min(npages, FUSE_MAX_PAGES_PER_REQ);
1375} 1340}
1376 1341
1377ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 1342ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1378 unsigned long nr_segs, size_t count, loff_t *ppos, 1343 loff_t *ppos, int flags)
1379 int flags)
1380{ 1344{
1381 int write = flags & FUSE_DIO_WRITE; 1345 int write = flags & FUSE_DIO_WRITE;
1382 int cuse = flags & FUSE_DIO_CUSE; 1346 int cuse = flags & FUSE_DIO_CUSE;
@@ -1386,18 +1350,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1386 struct fuse_conn *fc = ff->fc; 1350 struct fuse_conn *fc = ff->fc;
1387 size_t nmax = write ? fc->max_write : fc->max_read; 1351 size_t nmax = write ? fc->max_write : fc->max_read;
1388 loff_t pos = *ppos; 1352 loff_t pos = *ppos;
1353 size_t count = iov_iter_count(iter);
1389 pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; 1354 pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
1390 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; 1355 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1391 ssize_t res = 0; 1356 ssize_t res = 0;
1392 struct fuse_req *req; 1357 struct fuse_req *req;
1393 struct iov_iter ii;
1394
1395 iov_iter_init(&ii, iov, nr_segs, count, 0);
1396 1358
1397 if (io->async) 1359 if (io->async)
1398 req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); 1360 req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
1399 else 1361 else
1400 req = fuse_get_req(fc, fuse_iter_npages(&ii)); 1362 req = fuse_get_req(fc, fuse_iter_npages(iter));
1401 if (IS_ERR(req)) 1363 if (IS_ERR(req))
1402 return PTR_ERR(req); 1364 return PTR_ERR(req);
1403 1365
@@ -1413,7 +1375,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1413 size_t nres; 1375 size_t nres;
1414 fl_owner_t owner = current->files; 1376 fl_owner_t owner = current->files;
1415 size_t nbytes = min(count, nmax); 1377 size_t nbytes = min(count, nmax);
1416 int err = fuse_get_user_pages(req, &ii, &nbytes, write); 1378 int err = fuse_get_user_pages(req, iter, &nbytes, write);
1417 if (err) { 1379 if (err) {
1418 res = err; 1380 res = err;
1419 break; 1381 break;
@@ -1443,9 +1405,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1443 fuse_put_request(fc, req); 1405 fuse_put_request(fc, req);
1444 if (io->async) 1406 if (io->async)
1445 req = fuse_get_req_for_background(fc, 1407 req = fuse_get_req_for_background(fc,
1446 fuse_iter_npages(&ii)); 1408 fuse_iter_npages(iter));
1447 else 1409 else
1448 req = fuse_get_req(fc, fuse_iter_npages(&ii)); 1410 req = fuse_get_req(fc, fuse_iter_npages(iter));
1449 if (IS_ERR(req)) 1411 if (IS_ERR(req))
1450 break; 1412 break;
1451 } 1413 }
@@ -1460,9 +1422,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1460EXPORT_SYMBOL_GPL(fuse_direct_io); 1422EXPORT_SYMBOL_GPL(fuse_direct_io);
1461 1423
1462static ssize_t __fuse_direct_read(struct fuse_io_priv *io, 1424static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1463 const struct iovec *iov, 1425 struct iov_iter *iter,
1464 unsigned long nr_segs, loff_t *ppos, 1426 loff_t *ppos)
1465 size_t count)
1466{ 1427{
1467 ssize_t res; 1428 ssize_t res;
1468 struct file *file = io->file; 1429 struct file *file = io->file;
@@ -1471,7 +1432,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1471 if (is_bad_inode(inode)) 1432 if (is_bad_inode(inode))
1472 return -EIO; 1433 return -EIO;
1473 1434
1474 res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0); 1435 res = fuse_direct_io(io, iter, ppos, 0);
1475 1436
1476 fuse_invalidate_attr(inode); 1437 fuse_invalidate_attr(inode);
1477 1438
@@ -1483,22 +1444,26 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1483{ 1444{
1484 struct fuse_io_priv io = { .async = 0, .file = file }; 1445 struct fuse_io_priv io = { .async = 0, .file = file };
1485 struct iovec iov = { .iov_base = buf, .iov_len = count }; 1446 struct iovec iov = { .iov_base = buf, .iov_len = count };
1486 return __fuse_direct_read(&io, &iov, 1, ppos, count); 1447 struct iov_iter ii;
1448 iov_iter_init(&ii, READ, &iov, 1, count);
1449 return __fuse_direct_read(&io, &ii, ppos);
1487} 1450}
1488 1451
1489static ssize_t __fuse_direct_write(struct fuse_io_priv *io, 1452static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
1490 const struct iovec *iov, 1453 struct iov_iter *iter,
1491 unsigned long nr_segs, loff_t *ppos) 1454 loff_t *ppos)
1492{ 1455{
1493 struct file *file = io->file; 1456 struct file *file = io->file;
1494 struct inode *inode = file_inode(file); 1457 struct inode *inode = file_inode(file);
1495 size_t count = iov_length(iov, nr_segs); 1458 size_t count = iov_iter_count(iter);
1496 ssize_t res; 1459 ssize_t res;
1497 1460
1461
1498 res = generic_write_checks(file, ppos, &count, 0); 1462 res = generic_write_checks(file, ppos, &count, 0);
1499 if (!res) 1463 if (!res) {
1500 res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1464 iov_iter_truncate(iter, count);
1501 FUSE_DIO_WRITE); 1465 res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE);
1466 }
1502 1467
1503 fuse_invalidate_attr(inode); 1468 fuse_invalidate_attr(inode);
1504 1469
@@ -1512,13 +1477,15 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1512 struct inode *inode = file_inode(file); 1477 struct inode *inode = file_inode(file);
1513 ssize_t res; 1478 ssize_t res;
1514 struct fuse_io_priv io = { .async = 0, .file = file }; 1479 struct fuse_io_priv io = { .async = 0, .file = file };
1480 struct iov_iter ii;
1481 iov_iter_init(&ii, WRITE, &iov, 1, count);
1515 1482
1516 if (is_bad_inode(inode)) 1483 if (is_bad_inode(inode))
1517 return -EIO; 1484 return -EIO;
1518 1485
1519 /* Don't allow parallel writes to the same file */ 1486 /* Don't allow parallel writes to the same file */
1520 mutex_lock(&inode->i_mutex); 1487 mutex_lock(&inode->i_mutex);
1521 res = __fuse_direct_write(&io, &iov, 1, ppos); 1488 res = __fuse_direct_write(&io, &ii, ppos);
1522 if (res > 0) 1489 if (res > 0)
1523 fuse_write_update_size(inode, *ppos); 1490 fuse_write_update_size(inode, *ppos);
1524 mutex_unlock(&inode->i_mutex); 1491 mutex_unlock(&inode->i_mutex);
@@ -2372,7 +2339,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
2372 if (!bytes) 2339 if (!bytes)
2373 return 0; 2340 return 0;
2374 2341
2375 iov_iter_init(&ii, iov, nr_segs, bytes, 0); 2342 iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
2376 2343
2377 while (iov_iter_count(&ii)) { 2344 while (iov_iter_count(&ii)) {
2378 struct page *page = pages[page_idx++]; 2345 struct page *page = pages[page_idx++];
@@ -2894,8 +2861,8 @@ static inline loff_t fuse_round_up(loff_t off)
2894} 2861}
2895 2862
2896static ssize_t 2863static ssize_t
2897fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 2864fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
2898 loff_t offset, unsigned long nr_segs) 2865 loff_t offset)
2899{ 2866{
2900 ssize_t ret = 0; 2867 ssize_t ret = 0;
2901 struct file *file = iocb->ki_filp; 2868 struct file *file = iocb->ki_filp;
@@ -2904,7 +2871,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2904 loff_t pos = 0; 2871 loff_t pos = 0;
2905 struct inode *inode; 2872 struct inode *inode;
2906 loff_t i_size; 2873 loff_t i_size;
2907 size_t count = iov_length(iov, nr_segs); 2874 size_t count = iov_iter_count(iter);
2908 struct fuse_io_priv *io; 2875 struct fuse_io_priv *io;
2909 2876
2910 pos = offset; 2877 pos = offset;
@@ -2919,6 +2886,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2919 if (offset >= i_size) 2886 if (offset >= i_size)
2920 return 0; 2887 return 0;
2921 count = min_t(loff_t, count, fuse_round_up(i_size - offset)); 2888 count = min_t(loff_t, count, fuse_round_up(i_size - offset));
2889 iov_iter_truncate(iter, count);
2922 } 2890 }
2923 2891
2924 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); 2892 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
@@ -2948,9 +2916,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2948 io->async = false; 2916 io->async = false;
2949 2917
2950 if (rw == WRITE) 2918 if (rw == WRITE)
2951 ret = __fuse_direct_write(io, iov, nr_segs, &pos); 2919 ret = __fuse_direct_write(io, iter, &pos);
2952 else 2920 else
2953 ret = __fuse_direct_read(io, iov, nr_segs, &pos, count); 2921 ret = __fuse_direct_read(io, iter, &pos);
2954 2922
2955 if (io->async) { 2923 if (io->async) {
2956 fuse_aio_complete(io, ret < 0 ? ret : 0, -1); 2924 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
@@ -3061,10 +3029,10 @@ out:
3061 3029
3062static const struct file_operations fuse_file_operations = { 3030static const struct file_operations fuse_file_operations = {
3063 .llseek = fuse_file_llseek, 3031 .llseek = fuse_file_llseek,
3064 .read = do_sync_read, 3032 .read = new_sync_read,
3065 .aio_read = fuse_file_aio_read, 3033 .read_iter = fuse_file_read_iter,
3066 .write = do_sync_write, 3034 .write = new_sync_write,
3067 .aio_write = fuse_file_aio_write, 3035 .write_iter = fuse_file_write_iter,
3068 .mmap = fuse_file_mmap, 3036 .mmap = fuse_file_mmap,
3069 .open = fuse_open, 3037 .open = fuse_open,
3070 .flush = fuse_flush, 3038 .flush = fuse_flush,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7aa5c75e0de1..e8e47a6ab518 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -880,9 +880,8 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
880/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ 880/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
881#define FUSE_DIO_CUSE (1 << 1) 881#define FUSE_DIO_CUSE (1 << 1)
882 882
883ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 883ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
884 unsigned long nr_segs, size_t count, loff_t *ppos, 884 loff_t *ppos, int flags);
885 int flags);
886long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 885long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
887 unsigned int flags); 886 unsigned int flags);
888long fuse_ioctl_common(struct file *file, unsigned int cmd, 887long fuse_ioctl_common(struct file *file, unsigned int cmd,
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 492123cda64a..805b37fed638 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1040,8 +1040,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
1040 1040
1041 1041
1042static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, 1042static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1043 const struct iovec *iov, loff_t offset, 1043 struct iov_iter *iter, loff_t offset)
1044 unsigned long nr_segs)
1045{ 1044{
1046 struct file *file = iocb->ki_filp; 1045 struct file *file = iocb->ki_filp;
1047 struct inode *inode = file->f_mapping->host; 1046 struct inode *inode = file->f_mapping->host;
@@ -1081,7 +1080,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1081 */ 1080 */
1082 if (mapping->nrpages) { 1081 if (mapping->nrpages) {
1083 loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); 1082 loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
1084 loff_t len = iov_length(iov, nr_segs); 1083 loff_t len = iov_iter_count(iter);
1085 loff_t end = PAGE_ALIGN(offset + len) - 1; 1084 loff_t end = PAGE_ALIGN(offset + len) - 1;
1086 1085
1087 rv = 0; 1086 rv = 0;
@@ -1096,9 +1095,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1096 truncate_inode_pages_range(mapping, lstart, end); 1095 truncate_inode_pages_range(mapping, lstart, end);
1097 } 1096 }
1098 1097
1099 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1098 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
1100 offset, nr_segs, gfs2_get_block_direct, 1099 iter, offset,
1101 NULL, NULL, 0); 1100 gfs2_get_block_direct, NULL, NULL, 0);
1102out: 1101out:
1103 gfs2_glock_dq(&gh); 1102 gfs2_glock_dq(&gh);
1104 gfs2_holder_uninit(&gh); 1103 gfs2_holder_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6ab0cfb2e891..4fc3a3046174 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -684,7 +684,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
684} 684}
685 685
686/** 686/**
687 * gfs2_file_aio_write - Perform a write to a file 687 * gfs2_file_write_iter - Perform a write to a file
688 * @iocb: The io context 688 * @iocb: The io context
689 * @iov: The data to write 689 * @iov: The data to write
690 * @nr_segs: Number of @iov segments 690 * @nr_segs: Number of @iov segments
@@ -697,11 +697,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
697 * 697 *
698 */ 698 */
699 699
700static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 700static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
701 unsigned long nr_segs, loff_t pos)
702{ 701{
703 struct file *file = iocb->ki_filp; 702 struct file *file = iocb->ki_filp;
704 size_t writesize = iov_length(iov, nr_segs);
705 struct gfs2_inode *ip = GFS2_I(file_inode(file)); 703 struct gfs2_inode *ip = GFS2_I(file_inode(file));
706 int ret; 704 int ret;
707 705
@@ -709,7 +707,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
709 if (ret) 707 if (ret)
710 return ret; 708 return ret;
711 709
712 gfs2_size_hint(file, pos, writesize); 710 gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
713 711
714 if (file->f_flags & O_APPEND) { 712 if (file->f_flags & O_APPEND) {
715 struct gfs2_holder gh; 713 struct gfs2_holder gh;
@@ -720,7 +718,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
720 gfs2_glock_dq_uninit(&gh); 718 gfs2_glock_dq_uninit(&gh);
721 } 719 }
722 720
723 return generic_file_aio_write(iocb, iov, nr_segs, pos); 721 return generic_file_write_iter(iocb, from);
724} 722}
725 723
726static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, 724static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@ -1058,10 +1056,10 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
1058 1056
1059const struct file_operations gfs2_file_fops = { 1057const struct file_operations gfs2_file_fops = {
1060 .llseek = gfs2_llseek, 1058 .llseek = gfs2_llseek,
1061 .read = do_sync_read, 1059 .read = new_sync_read,
1062 .aio_read = generic_file_aio_read, 1060 .read_iter = generic_file_read_iter,
1063 .write = do_sync_write, 1061 .write = new_sync_write,
1064 .aio_write = gfs2_file_aio_write, 1062 .write_iter = gfs2_file_write_iter,
1065 .unlocked_ioctl = gfs2_ioctl, 1063 .unlocked_ioctl = gfs2_ioctl,
1066 .mmap = gfs2_mmap, 1064 .mmap = gfs2_mmap,
1067 .open = gfs2_open, 1065 .open = gfs2_open,
@@ -1070,7 +1068,7 @@ const struct file_operations gfs2_file_fops = {
1070 .lock = gfs2_lock, 1068 .lock = gfs2_lock,
1071 .flock = gfs2_flock, 1069 .flock = gfs2_flock,
1072 .splice_read = generic_file_splice_read, 1070 .splice_read = generic_file_splice_read,
1073 .splice_write = generic_file_splice_write, 1071 .splice_write = iter_file_splice_write,
1074 .setlease = gfs2_setlease, 1072 .setlease = gfs2_setlease,
1075 .fallocate = gfs2_fallocate, 1073 .fallocate = gfs2_fallocate,
1076}; 1074};
@@ -1090,17 +1088,17 @@ const struct file_operations gfs2_dir_fops = {
1090 1088
1091const struct file_operations gfs2_file_fops_nolock = { 1089const struct file_operations gfs2_file_fops_nolock = {
1092 .llseek = gfs2_llseek, 1090 .llseek = gfs2_llseek,
1093 .read = do_sync_read, 1091 .read = new_sync_read,
1094 .aio_read = generic_file_aio_read, 1092 .read_iter = generic_file_read_iter,
1095 .write = do_sync_write, 1093 .write = new_sync_write,
1096 .aio_write = gfs2_file_aio_write, 1094 .write_iter = gfs2_file_write_iter,
1097 .unlocked_ioctl = gfs2_ioctl, 1095 .unlocked_ioctl = gfs2_ioctl,
1098 .mmap = gfs2_mmap, 1096 .mmap = gfs2_mmap,
1099 .open = gfs2_open, 1097 .open = gfs2_open,
1100 .release = gfs2_release, 1098 .release = gfs2_release,
1101 .fsync = gfs2_fsync, 1099 .fsync = gfs2_fsync,
1102 .splice_read = generic_file_splice_read, 1100 .splice_read = generic_file_splice_read,
1103 .splice_write = generic_file_splice_write, 1101 .splice_write = iter_file_splice_write,
1104 .setlease = generic_setlease, 1102 .setlease = generic_setlease,
1105 .fallocate = gfs2_fallocate, 1103 .fallocate = gfs2_fallocate,
1106}; 1104};
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9e2fecd62f62..d0929bc81782 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -125,15 +125,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
125} 125}
126 126
127static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, 127static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
128 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 128 struct iov_iter *iter, loff_t offset)
129{ 129{
130 struct file *file = iocb->ki_filp; 130 struct file *file = iocb->ki_filp;
131 struct address_space *mapping = file->f_mapping; 131 struct address_space *mapping = file->f_mapping;
132 struct inode *inode = file_inode(file)->i_mapping->host; 132 struct inode *inode = file_inode(file)->i_mapping->host;
133 size_t count = iov_iter_count(iter);
133 ssize_t ret; 134 ssize_t ret;
134 135
135 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 136 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block);
136 hfs_get_block);
137 137
138 /* 138 /*
139 * In case of error extending write may have instantiated a few 139 * In case of error extending write may have instantiated a few
@@ -141,7 +141,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
141 */ 141 */
142 if (unlikely((rw & WRITE) && ret < 0)) { 142 if (unlikely((rw & WRITE) && ret < 0)) {
143 loff_t isize = i_size_read(inode); 143 loff_t isize = i_size_read(inode);
144 loff_t end = offset + iov_length(iov, nr_segs); 144 loff_t end = offset + count;
145 145
146 if (end > isize) 146 if (end > isize)
147 hfs_write_failed(mapping, end); 147 hfs_write_failed(mapping, end);
@@ -674,10 +674,10 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
674 674
675static const struct file_operations hfs_file_operations = { 675static const struct file_operations hfs_file_operations = {
676 .llseek = generic_file_llseek, 676 .llseek = generic_file_llseek,
677 .read = do_sync_read, 677 .read = new_sync_read,
678 .aio_read = generic_file_aio_read, 678 .read_iter = generic_file_read_iter,
679 .write = do_sync_write, 679 .write = new_sync_write,
680 .aio_write = generic_file_aio_write, 680 .write_iter = generic_file_write_iter,
681 .mmap = generic_file_mmap, 681 .mmap = generic_file_mmap,
682 .splice_read = generic_file_splice_read, 682 .splice_read = generic_file_splice_read,
683 .fsync = hfs_file_fsync, 683 .fsync = hfs_file_fsync,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a4f45bd88a63..0cf786f2d046 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -123,14 +123,15 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
123} 123}
124 124
125static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, 125static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
126 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 126 struct iov_iter *iter, loff_t offset)
127{ 127{
128 struct file *file = iocb->ki_filp; 128 struct file *file = iocb->ki_filp;
129 struct address_space *mapping = file->f_mapping; 129 struct address_space *mapping = file->f_mapping;
130 struct inode *inode = file_inode(file)->i_mapping->host; 130 struct inode *inode = file_inode(file)->i_mapping->host;
131 size_t count = iov_iter_count(iter);
131 ssize_t ret; 132 ssize_t ret;
132 133
133 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 134 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
134 hfsplus_get_block); 135 hfsplus_get_block);
135 136
136 /* 137 /*
@@ -139,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
139 */ 140 */
140 if (unlikely((rw & WRITE) && ret < 0)) { 141 if (unlikely((rw & WRITE) && ret < 0)) {
141 loff_t isize = i_size_read(inode); 142 loff_t isize = i_size_read(inode);
142 loff_t end = offset + iov_length(iov, nr_segs); 143 loff_t end = offset + count;
143 144
144 if (end > isize) 145 if (end > isize)
145 hfsplus_write_failed(mapping, end); 146 hfsplus_write_failed(mapping, end);
@@ -340,10 +341,10 @@ static const struct inode_operations hfsplus_file_inode_operations = {
340 341
341static const struct file_operations hfsplus_file_operations = { 342static const struct file_operations hfsplus_file_operations = {
342 .llseek = generic_file_llseek, 343 .llseek = generic_file_llseek,
343 .read = do_sync_read, 344 .read = new_sync_read,
344 .aio_read = generic_file_aio_read, 345 .read_iter = generic_file_read_iter,
345 .write = do_sync_write, 346 .write = new_sync_write,
346 .aio_write = generic_file_aio_write, 347 .write_iter = generic_file_write_iter,
347 .mmap = generic_file_mmap, 348 .mmap = generic_file_mmap,
348 .splice_read = generic_file_splice_read, 349 .splice_read = generic_file_splice_read,
349 .fsync = hfsplus_file_fsync, 350 .fsync = hfsplus_file_fsync,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 9c470fde9878..bb529f3b7f2b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -378,11 +378,11 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
378 378
379static const struct file_operations hostfs_file_fops = { 379static const struct file_operations hostfs_file_fops = {
380 .llseek = generic_file_llseek, 380 .llseek = generic_file_llseek,
381 .read = do_sync_read, 381 .read = new_sync_read,
382 .splice_read = generic_file_splice_read, 382 .splice_read = generic_file_splice_read,
383 .aio_read = generic_file_aio_read, 383 .read_iter = generic_file_read_iter,
384 .aio_write = generic_file_aio_write, 384 .write_iter = generic_file_write_iter,
385 .write = do_sync_write, 385 .write = new_sync_write,
386 .mmap = generic_file_mmap, 386 .mmap = generic_file_mmap,
387 .open = hostfs_file_open, 387 .open = hostfs_file_open,
388 .release = hostfs_file_release, 388 .release = hostfs_file_release,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 67c1a61e0955..7f54e5f76cec 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -197,10 +197,10 @@ const struct address_space_operations hpfs_aops = {
197const struct file_operations hpfs_file_ops = 197const struct file_operations hpfs_file_ops =
198{ 198{
199 .llseek = generic_file_llseek, 199 .llseek = generic_file_llseek,
200 .read = do_sync_read, 200 .read = new_sync_read,
201 .aio_read = generic_file_aio_read, 201 .read_iter = generic_file_read_iter,
202 .write = do_sync_write, 202 .write = new_sync_write,
203 .aio_write = generic_file_aio_write, 203 .write_iter = generic_file_write_iter,
204 .mmap = generic_file_mmap, 204 .mmap = generic_file_mmap,
205 .release = hpfs_file_release, 205 .release = hpfs_file_release,
206 .fsync = hpfs_file_fsync, 206 .fsync = hpfs_file_fsync,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 256cd19a3b78..64989ca9ba90 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -51,10 +51,10 @@ const struct file_operations jffs2_file_operations =
51{ 51{
52 .llseek = generic_file_llseek, 52 .llseek = generic_file_llseek,
53 .open = generic_file_open, 53 .open = generic_file_open,
54 .read = do_sync_read, 54 .read = new_sync_read,
55 .aio_read = generic_file_aio_read, 55 .read_iter = generic_file_read_iter,
56 .write = do_sync_write, 56 .write = new_sync_write,
57 .aio_write = generic_file_aio_write, 57 .write_iter = generic_file_write_iter,
58 .unlocked_ioctl=jffs2_ioctl, 58 .unlocked_ioctl=jffs2_ioctl,
59 .mmap = generic_file_readonly_mmap, 59 .mmap = generic_file_readonly_mmap,
60 .fsync = jffs2_fsync, 60 .fsync = jffs2_fsync,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 794da944d5cd..33aa0cc1f8b8 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -151,13 +151,13 @@ const struct inode_operations jfs_file_inode_operations = {
151const struct file_operations jfs_file_operations = { 151const struct file_operations jfs_file_operations = {
152 .open = jfs_open, 152 .open = jfs_open,
153 .llseek = generic_file_llseek, 153 .llseek = generic_file_llseek,
154 .write = do_sync_write, 154 .write = new_sync_write,
155 .read = do_sync_read, 155 .read = new_sync_read,
156 .aio_read = generic_file_aio_read, 156 .read_iter = generic_file_read_iter,
157 .aio_write = generic_file_aio_write, 157 .write_iter = generic_file_write_iter,
158 .mmap = generic_file_mmap, 158 .mmap = generic_file_mmap,
159 .splice_read = generic_file_splice_read, 159 .splice_read = generic_file_splice_read,
160 .splice_write = generic_file_splice_write, 160 .splice_write = iter_file_splice_write,
161 .fsync = jfs_fsync, 161 .fsync = jfs_fsync,
162 .release = jfs_release, 162 .release = jfs_release,
163 .unlocked_ioctl = jfs_ioctl, 163 .unlocked_ioctl = jfs_ioctl,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 6f8fe72c2a7a..bd3df1ca3c9b 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -331,15 +331,15 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
331} 331}
332 332
333static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, 333static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
334 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 334 struct iov_iter *iter, loff_t offset)
335{ 335{
336 struct file *file = iocb->ki_filp; 336 struct file *file = iocb->ki_filp;
337 struct address_space *mapping = file->f_mapping; 337 struct address_space *mapping = file->f_mapping;
338 struct inode *inode = file->f_mapping->host; 338 struct inode *inode = file->f_mapping->host;
339 size_t count = iov_iter_count(iter);
339 ssize_t ret; 340 ssize_t ret;
340 341
341 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 342 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block);
342 jfs_get_block);
343 343
344 /* 344 /*
345 * In case of error extending write may have instantiated a few 345 * In case of error extending write may have instantiated a few
@@ -347,7 +347,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
347 */ 347 */
348 if (unlikely((rw & WRITE) && ret < 0)) { 348 if (unlikely((rw & WRITE) && ret < 0)) {
349 loff_t isize = i_size_read(inode); 349 loff_t isize = i_size_read(inode);
350 loff_t end = offset + iov_length(iov, nr_segs); 350 loff_t end = offset + count;
351 351
352 if (end > isize) 352 if (end > isize)
353 jfs_write_failed(mapping, end); 353 jfs_write_failed(mapping, end);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e3d37f607f97..d895b4b7b661 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -39,6 +39,19 @@ struct kernfs_open_node {
39 struct list_head files; /* goes through kernfs_open_file.list */ 39 struct list_head files; /* goes through kernfs_open_file.list */
40}; 40};
41 41
42/*
43 * kernfs_notify() may be called from any context and bounces notifications
44 * through a work item. To minimize space overhead in kernfs_node, the
45 * pending queue is implemented as a singly linked list of kernfs_nodes.
46 * The list is terminated with the self pointer so that whether a
47 * kernfs_node is on the list or not can be determined by testing the next
48 * pointer for NULL.
49 */
50#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
51
52static DEFINE_SPINLOCK(kernfs_notify_lock);
53static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
54
42static struct kernfs_open_file *kernfs_of(struct file *file) 55static struct kernfs_open_file *kernfs_of(struct file *file)
43{ 56{
44 return ((struct seq_file *)file->private_data)->private; 57 return ((struct seq_file *)file->private_data)->private;
@@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
783 return DEFAULT_POLLMASK|POLLERR|POLLPRI; 796 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
784} 797}
785 798
786/** 799static void kernfs_notify_workfn(struct work_struct *work)
787 * kernfs_notify - notify a kernfs file
788 * @kn: file to notify
789 *
790 * Notify @kn such that poll(2) on @kn wakes up.
791 */
792void kernfs_notify(struct kernfs_node *kn)
793{ 800{
794 struct kernfs_root *root = kernfs_root(kn); 801 struct kernfs_node *kn;
795 struct kernfs_open_node *on; 802 struct kernfs_open_node *on;
796 struct kernfs_super_info *info; 803 struct kernfs_super_info *info;
797 unsigned long flags; 804repeat:
798 805 /* pop one off the notify_list */
799 if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) 806 spin_lock_irq(&kernfs_notify_lock);
807 kn = kernfs_notify_list;
808 if (kn == KERNFS_NOTIFY_EOL) {
809 spin_unlock_irq(&kernfs_notify_lock);
800 return; 810 return;
811 }
812 kernfs_notify_list = kn->attr.notify_next;
813 kn->attr.notify_next = NULL;
814 spin_unlock_irq(&kernfs_notify_lock);
801 815
802 /* kick poll */ 816 /* kick poll */
803 spin_lock_irqsave(&kernfs_open_node_lock, flags); 817 spin_lock_irq(&kernfs_open_node_lock);
804 818
805 on = kn->attr.open; 819 on = kn->attr.open;
806 if (on) { 820 if (on) {
@@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn)
808 wake_up_interruptible(&on->poll); 822 wake_up_interruptible(&on->poll);
809 } 823 }
810 824
811 spin_unlock_irqrestore(&kernfs_open_node_lock, flags); 825 spin_unlock_irq(&kernfs_open_node_lock);
812 826
813 /* kick fsnotify */ 827 /* kick fsnotify */
814 mutex_lock(&kernfs_mutex); 828 mutex_lock(&kernfs_mutex);
815 829
816 list_for_each_entry(info, &root->supers, node) { 830 list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
817 struct inode *inode; 831 struct inode *inode;
818 struct dentry *dentry; 832 struct dentry *dentry;
819 833
@@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn)
833 } 847 }
834 848
835 mutex_unlock(&kernfs_mutex); 849 mutex_unlock(&kernfs_mutex);
850 kernfs_put(kn);
851 goto repeat;
852}
853
854/**
855 * kernfs_notify - notify a kernfs file
856 * @kn: file to notify
857 *
858 * Notify @kn such that poll(2) on @kn wakes up. May be called from any
859 * context.
860 */
861void kernfs_notify(struct kernfs_node *kn)
862{
863 static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
864 unsigned long flags;
865
866 if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
867 return;
868
869 spin_lock_irqsave(&kernfs_notify_lock, flags);
870 if (!kn->attr.notify_next) {
871 kernfs_get(kn);
872 kn->attr.notify_next = kernfs_notify_list;
873 kernfs_notify_list = kn;
874 schedule_work(&kernfs_notify_work);
875 }
876 spin_unlock_irqrestore(&kernfs_notify_lock, flags);
836} 877}
837EXPORT_SYMBOL_GPL(kernfs_notify); 878EXPORT_SYMBOL_GPL(kernfs_notify);
838 879
diff --git a/fs/locks.c b/fs/locks.c
index da57c9b7e844..717fbc404e6b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -431,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
431 if (assign_type(fl, type) != 0) 431 if (assign_type(fl, type) != 0)
432 return -EINVAL; 432 return -EINVAL;
433 433
434 fl->fl_owner = (fl_owner_t)filp; 434 fl->fl_owner = (fl_owner_t)current->files;
435 fl->fl_pid = current->tgid; 435 fl->fl_pid = current->tgid;
436 436
437 fl->fl_file = filp; 437 fl->fl_file = filp;
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 57914fc32b62..8538752df2f6 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -264,15 +264,15 @@ const struct inode_operations logfs_reg_iops = {
264}; 264};
265 265
266const struct file_operations logfs_reg_fops = { 266const struct file_operations logfs_reg_fops = {
267 .aio_read = generic_file_aio_read, 267 .read_iter = generic_file_read_iter,
268 .aio_write = generic_file_aio_write, 268 .write_iter = generic_file_write_iter,
269 .fsync = logfs_fsync, 269 .fsync = logfs_fsync,
270 .unlocked_ioctl = logfs_ioctl, 270 .unlocked_ioctl = logfs_ioctl,
271 .llseek = generic_file_llseek, 271 .llseek = generic_file_llseek,
272 .mmap = generic_file_readonly_mmap, 272 .mmap = generic_file_readonly_mmap,
273 .open = generic_file_open, 273 .open = generic_file_open,
274 .read = do_sync_read, 274 .read = new_sync_read,
275 .write = do_sync_write, 275 .write = new_sync_write,
276}; 276};
277 277
278const struct address_space_operations logfs_reg_aops = { 278const struct address_space_operations logfs_reg_aops = {
diff --git a/fs/mbcache.c b/fs/mbcache.c
index bf166e388f0d..187477ded6b3 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -73,6 +73,7 @@
73#include <linux/mbcache.h> 73#include <linux/mbcache.h>
74#include <linux/init.h> 74#include <linux/init.h>
75#include <linux/blockgroup_lock.h> 75#include <linux/blockgroup_lock.h>
76#include <linux/log2.h>
76 77
77#ifdef MB_CACHE_DEBUG 78#ifdef MB_CACHE_DEBUG
78# define mb_debug(f...) do { \ 79# define mb_debug(f...) do { \
@@ -93,7 +94,7 @@
93 94
94#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 95#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
95 96
96#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) 97#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
97#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ 98#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
98 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) 99 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99 100
diff --git a/fs/minix/file.c b/fs/minix/file.c
index adc6f5494231..a967de085ac0 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -14,10 +14,10 @@
14 */ 14 */
15const struct file_operations minix_file_operations = { 15const struct file_operations minix_file_operations = {
16 .llseek = generic_file_llseek, 16 .llseek = generic_file_llseek,
17 .read = do_sync_read, 17 .read = new_sync_read,
18 .aio_read = generic_file_aio_read, 18 .read_iter = generic_file_read_iter,
19 .write = do_sync_write, 19 .write = new_sync_write,
20 .aio_write = generic_file_aio_write, 20 .write_iter = generic_file_write_iter,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = generic_file_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4ad7bc388679..8f98138cbc43 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -212,20 +212,20 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
212 * shunt off direct read and write requests before the VFS gets them, 212 * shunt off direct read and write requests before the VFS gets them,
213 * so this method is only ever called for swap. 213 * so this method is only ever called for swap.
214 */ 214 */
215ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) 215ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
216{ 216{
217#ifndef CONFIG_NFS_SWAP 217#ifndef CONFIG_NFS_SWAP
218 dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", 218 dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
219 iocb->ki_filp, (long long) pos, nr_segs); 219 iocb->ki_filp, (long long) pos, iter->nr_segs);
220 220
221 return -EINVAL; 221 return -EINVAL;
222#else 222#else
223 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); 223 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
224 224
225 if (rw == READ || rw == KERNEL_READ) 225 if (rw == READ || rw == KERNEL_READ)
226 return nfs_file_direct_read(iocb, iov, nr_segs, pos, 226 return nfs_file_direct_read(iocb, iter, pos,
227 rw == READ ? true : false); 227 rw == READ ? true : false);
228 return nfs_file_direct_write(iocb, iov, nr_segs, pos, 228 return nfs_file_direct_write(iocb, iter, pos,
229 rw == WRITE ? true : false); 229 rw == WRITE ? true : false);
230#endif /* CONFIG_NFS_SWAP */ 230#endif /* CONFIG_NFS_SWAP */
231} 231}
@@ -414,60 +414,37 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
414 * handled automatically by nfs_direct_read_result(). Otherwise, if 414 * handled automatically by nfs_direct_read_result(). Otherwise, if
415 * no requests have been sent, just return an error. 415 * no requests have been sent, just return an error.
416 */ 416 */
417static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
418 const struct iovec *iov,
419 loff_t pos, bool uio)
420{
421 struct nfs_direct_req *dreq = desc->pg_dreq;
422 struct nfs_open_context *ctx = dreq->ctx;
423 struct inode *inode = ctx->dentry->d_inode;
424 unsigned long user_addr = (unsigned long)iov->iov_base;
425 size_t count = iov->iov_len;
426 size_t rsize = NFS_SERVER(inode)->rsize;
427 unsigned int pgbase;
428 int result;
429 ssize_t started = 0;
430 struct page **pagevec = NULL;
431 unsigned int npages;
432
433 do {
434 size_t bytes;
435 int i;
436 417
437 pgbase = user_addr & ~PAGE_MASK; 418static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
438 bytes = min(max_t(size_t, rsize, PAGE_SIZE), count); 419 struct iov_iter *iter,
420 loff_t pos)
421{
422 struct nfs_pageio_descriptor desc;
423 struct inode *inode = dreq->inode;
424 ssize_t result = -EINVAL;
425 size_t requested_bytes = 0;
426 size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
439 427
440 result = -ENOMEM; 428 nfs_pageio_init_read(&desc, dreq->inode, false,
441 npages = nfs_page_array_len(pgbase, bytes); 429 &nfs_direct_read_completion_ops);
442 if (!pagevec) 430 get_dreq(dreq);
443 pagevec = kmalloc(npages * sizeof(struct page *), 431 desc.pg_dreq = dreq;
444 GFP_KERNEL); 432 atomic_inc(&inode->i_dio_count);
445 if (!pagevec)
446 break;
447 if (uio) {
448 down_read(&current->mm->mmap_sem);
449 result = get_user_pages(current, current->mm, user_addr,
450 npages, 1, 0, pagevec, NULL);
451 up_read(&current->mm->mmap_sem);
452 if (result < 0)
453 break;
454 } else {
455 WARN_ON(npages != 1);
456 result = get_kernel_page(user_addr, 1, pagevec);
457 if (WARN_ON(result != 1))
458 break;
459 }
460 433
461 if ((unsigned)result < npages) { 434 while (iov_iter_count(iter)) {
462 bytes = result * PAGE_SIZE; 435 struct page **pagevec;
463 if (bytes <= pgbase) { 436 size_t bytes;
464 nfs_direct_release_pages(pagevec, result); 437 size_t pgbase;
465 break; 438 unsigned npages, i;
466 }
467 bytes -= pgbase;
468 npages = result;
469 }
470 439
440 result = iov_iter_get_pages_alloc(iter, &pagevec,
441 rsize, &pgbase);
442 if (result < 0)
443 break;
444
445 bytes = result;
446 iov_iter_advance(iter, bytes);
447 npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
471 for (i = 0; i < npages; i++) { 448 for (i = 0; i < npages; i++) {
472 struct nfs_page *req; 449 struct nfs_page *req;
473 unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); 450 unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
@@ -480,56 +457,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
480 } 457 }
481 req->wb_index = pos >> PAGE_SHIFT; 458 req->wb_index = pos >> PAGE_SHIFT;
482 req->wb_offset = pos & ~PAGE_MASK; 459 req->wb_offset = pos & ~PAGE_MASK;
483 if (!nfs_pageio_add_request(desc, req)) { 460 if (!nfs_pageio_add_request(&desc, req)) {
484 result = desc->pg_error; 461 result = desc.pg_error;
485 nfs_release_request(req); 462 nfs_release_request(req);
486 break; 463 break;
487 } 464 }
488 pgbase = 0; 465 pgbase = 0;
489 bytes -= req_len; 466 bytes -= req_len;
490 started += req_len; 467 requested_bytes += req_len;
491 user_addr += req_len;
492 pos += req_len; 468 pos += req_len;
493 count -= req_len;
494 dreq->bytes_left -= req_len; 469 dreq->bytes_left -= req_len;
495 } 470 }
496 /* The nfs_page now hold references to these pages */
497 nfs_direct_release_pages(pagevec, npages); 471 nfs_direct_release_pages(pagevec, npages);
498 } while (count != 0 && result >= 0); 472 kvfree(pagevec);
499
500 kfree(pagevec);
501
502 if (started)
503 return started;
504 return result < 0 ? (ssize_t) result : -EFAULT;
505}
506
507static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
508 const struct iovec *iov,
509 unsigned long nr_segs,
510 loff_t pos, bool uio)
511{
512 struct nfs_pageio_descriptor desc;
513 struct inode *inode = dreq->inode;
514 ssize_t result = -EINVAL;
515 size_t requested_bytes = 0;
516 unsigned long seg;
517
518 nfs_pageio_init_read(&desc, dreq->inode, false,
519 &nfs_direct_read_completion_ops);
520 get_dreq(dreq);
521 desc.pg_dreq = dreq;
522 atomic_inc(&inode->i_dio_count);
523
524 for (seg = 0; seg < nr_segs; seg++) {
525 const struct iovec *vec = &iov[seg];
526 result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
527 if (result < 0) 473 if (result < 0)
528 break; 474 break;
529 requested_bytes += result;
530 if ((size_t)result < vec->iov_len)
531 break;
532 pos += vec->iov_len;
533 } 475 }
534 476
535 nfs_pageio_complete(&desc); 477 nfs_pageio_complete(&desc);
@@ -552,8 +494,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
552/** 494/**
553 * nfs_file_direct_read - file direct read operation for NFS files 495 * nfs_file_direct_read - file direct read operation for NFS files
554 * @iocb: target I/O control block 496 * @iocb: target I/O control block
555 * @iov: vector of user buffers into which to read data 497 * @iter: vector of user buffers into which to read data
556 * @nr_segs: size of iov vector
557 * @pos: byte offset in file where reading starts 498 * @pos: byte offset in file where reading starts
558 * 499 *
559 * We use this function for direct reads instead of calling 500 * We use this function for direct reads instead of calling
@@ -570,8 +511,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
570 * client must read the updated atime from the server back into its 511 * client must read the updated atime from the server back into its
571 * cache. 512 * cache.
572 */ 513 */
573ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, 514ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
574 unsigned long nr_segs, loff_t pos, bool uio) 515 loff_t pos, bool uio)
575{ 516{
576 struct file *file = iocb->ki_filp; 517 struct file *file = iocb->ki_filp;
577 struct address_space *mapping = file->f_mapping; 518 struct address_space *mapping = file->f_mapping;
@@ -579,9 +520,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
579 struct nfs_direct_req *dreq; 520 struct nfs_direct_req *dreq;
580 struct nfs_lock_context *l_ctx; 521 struct nfs_lock_context *l_ctx;
581 ssize_t result = -EINVAL; 522 ssize_t result = -EINVAL;
582 size_t count; 523 size_t count = iov_iter_count(iter);
583
584 count = iov_length(iov, nr_segs);
585 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); 524 nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
586 525
587 dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", 526 dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
@@ -604,7 +543,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
604 goto out_unlock; 543 goto out_unlock;
605 544
606 dreq->inode = inode; 545 dreq->inode = inode;
607 dreq->bytes_left = iov_length(iov, nr_segs); 546 dreq->bytes_left = count;
608 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 547 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
609 l_ctx = nfs_get_lock_context(dreq->ctx); 548 l_ctx = nfs_get_lock_context(dreq->ctx);
610 if (IS_ERR(l_ctx)) { 549 if (IS_ERR(l_ctx)) {
@@ -615,8 +554,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
615 if (!is_sync_kiocb(iocb)) 554 if (!is_sync_kiocb(iocb))
616 dreq->iocb = iocb; 555 dreq->iocb = iocb;
617 556
618 NFS_I(inode)->read_io += iov_length(iov, nr_segs); 557 NFS_I(inode)->read_io += count;
619 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); 558 result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
620 559
621 mutex_unlock(&inode->i_mutex); 560 mutex_unlock(&inode->i_mutex);
622 561
@@ -772,108 +711,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
772} 711}
773#endif 712#endif
774 713
775/*
776 * NB: Return the value of the first error return code. Subsequent
777 * errors after the first one are ignored.
778 */
779/*
780 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
781 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
782 * bail and stop sending more writes. Write length accounting is
783 * handled automatically by nfs_direct_write_result(). Otherwise, if
784 * no requests have been sent, just return an error.
785 */
786static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
787 const struct iovec *iov,
788 loff_t pos, bool uio)
789{
790 struct nfs_direct_req *dreq = desc->pg_dreq;
791 struct nfs_open_context *ctx = dreq->ctx;
792 struct inode *inode = ctx->dentry->d_inode;
793 unsigned long user_addr = (unsigned long)iov->iov_base;
794 size_t count = iov->iov_len;
795 size_t wsize = NFS_SERVER(inode)->wsize;
796 unsigned int pgbase;
797 int result;
798 ssize_t started = 0;
799 struct page **pagevec = NULL;
800 unsigned int npages;
801
802 do {
803 size_t bytes;
804 int i;
805
806 pgbase = user_addr & ~PAGE_MASK;
807 bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
808
809 result = -ENOMEM;
810 npages = nfs_page_array_len(pgbase, bytes);
811 if (!pagevec)
812 pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
813 if (!pagevec)
814 break;
815
816 if (uio) {
817 down_read(&current->mm->mmap_sem);
818 result = get_user_pages(current, current->mm, user_addr,
819 npages, 0, 0, pagevec, NULL);
820 up_read(&current->mm->mmap_sem);
821 if (result < 0)
822 break;
823 } else {
824 WARN_ON(npages != 1);
825 result = get_kernel_page(user_addr, 0, pagevec);
826 if (WARN_ON(result != 1))
827 break;
828 }
829
830 if ((unsigned)result < npages) {
831 bytes = result * PAGE_SIZE;
832 if (bytes <= pgbase) {
833 nfs_direct_release_pages(pagevec, result);
834 break;
835 }
836 bytes -= pgbase;
837 npages = result;
838 }
839
840 for (i = 0; i < npages; i++) {
841 struct nfs_page *req;
842 unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
843
844 req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
845 pgbase, req_len);
846 if (IS_ERR(req)) {
847 result = PTR_ERR(req);
848 break;
849 }
850 nfs_lock_request(req);
851 req->wb_index = pos >> PAGE_SHIFT;
852 req->wb_offset = pos & ~PAGE_MASK;
853 if (!nfs_pageio_add_request(desc, req)) {
854 result = desc->pg_error;
855 nfs_unlock_and_release_request(req);
856 break;
857 }
858 pgbase = 0;
859 bytes -= req_len;
860 started += req_len;
861 user_addr += req_len;
862 pos += req_len;
863 count -= req_len;
864 dreq->bytes_left -= req_len;
865 }
866 /* The nfs_page now hold references to these pages */
867 nfs_direct_release_pages(pagevec, npages);
868 } while (count != 0 && result >= 0);
869
870 kfree(pagevec);
871
872 if (started)
873 return started;
874 return result < 0 ? (ssize_t) result : -EFAULT;
875}
876
877static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) 714static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
878{ 715{
879 struct nfs_direct_req *dreq = hdr->dreq; 716 struct nfs_direct_req *dreq = hdr->dreq;
@@ -956,16 +793,27 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
956 .completion = nfs_direct_write_completion, 793 .completion = nfs_direct_write_completion,
957}; 794};
958 795
796
797/*
798 * NB: Return the value of the first error return code. Subsequent
799 * errors after the first one are ignored.
800 */
801/*
802 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
803 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
804 * bail and stop sending more writes. Write length accounting is
805 * handled automatically by nfs_direct_write_result(). Otherwise, if
806 * no requests have been sent, just return an error.
807 */
959static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, 808static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
960 const struct iovec *iov, 809 struct iov_iter *iter,
961 unsigned long nr_segs, 810 loff_t pos)
962 loff_t pos, bool uio)
963{ 811{
964 struct nfs_pageio_descriptor desc; 812 struct nfs_pageio_descriptor desc;
965 struct inode *inode = dreq->inode; 813 struct inode *inode = dreq->inode;
966 ssize_t result = 0; 814 ssize_t result = 0;
967 size_t requested_bytes = 0; 815 size_t requested_bytes = 0;
968 unsigned long seg; 816 size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
969 817
970 nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, 818 nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
971 &nfs_direct_write_completion_ops); 819 &nfs_direct_write_completion_ops);
@@ -973,16 +821,49 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
973 get_dreq(dreq); 821 get_dreq(dreq);
974 atomic_inc(&inode->i_dio_count); 822 atomic_inc(&inode->i_dio_count);
975 823
976 NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); 824 NFS_I(inode)->write_io += iov_iter_count(iter);
977 for (seg = 0; seg < nr_segs; seg++) { 825 while (iov_iter_count(iter)) {
978 const struct iovec *vec = &iov[seg]; 826 struct page **pagevec;
979 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); 827 size_t bytes;
828 size_t pgbase;
829 unsigned npages, i;
830
831 result = iov_iter_get_pages_alloc(iter, &pagevec,
832 wsize, &pgbase);
980 if (result < 0) 833 if (result < 0)
981 break; 834 break;
982 requested_bytes += result; 835
983 if ((size_t)result < vec->iov_len) 836 bytes = result;
837 iov_iter_advance(iter, bytes);
838 npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
839 for (i = 0; i < npages; i++) {
840 struct nfs_page *req;
841 unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
842
843 req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
844 pgbase, req_len);
845 if (IS_ERR(req)) {
846 result = PTR_ERR(req);
847 break;
848 }
849 nfs_lock_request(req);
850 req->wb_index = pos >> PAGE_SHIFT;
851 req->wb_offset = pos & ~PAGE_MASK;
852 if (!nfs_pageio_add_request(&desc, req)) {
853 result = desc.pg_error;
854 nfs_unlock_and_release_request(req);
855 break;
856 }
857 pgbase = 0;
858 bytes -= req_len;
859 requested_bytes += req_len;
860 pos += req_len;
861 dreq->bytes_left -= req_len;
862 }
863 nfs_direct_release_pages(pagevec, npages);
864 kvfree(pagevec);
865 if (result < 0)
984 break; 866 break;
985 pos += vec->iov_len;
986 } 867 }
987 nfs_pageio_complete(&desc); 868 nfs_pageio_complete(&desc);
988 869
@@ -1004,8 +885,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
1004/** 885/**
1005 * nfs_file_direct_write - file direct write operation for NFS files 886 * nfs_file_direct_write - file direct write operation for NFS files
1006 * @iocb: target I/O control block 887 * @iocb: target I/O control block
1007 * @iov: vector of user buffers from which to write data 888 * @iter: vector of user buffers from which to write data
1008 * @nr_segs: size of iov vector
1009 * @pos: byte offset in file where writing starts 889 * @pos: byte offset in file where writing starts
1010 * 890 *
1011 * We use this function for direct writes instead of calling 891 * We use this function for direct writes instead of calling
@@ -1023,8 +903,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
1023 * Note that O_APPEND is not supported for NFS direct writes, as there 903 * Note that O_APPEND is not supported for NFS direct writes, as there
1024 * is no atomic O_APPEND write facility in the NFS protocol. 904 * is no atomic O_APPEND write facility in the NFS protocol.
1025 */ 905 */
1026ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 906ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
1027 unsigned long nr_segs, loff_t pos, bool uio) 907 loff_t pos, bool uio)
1028{ 908{
1029 ssize_t result = -EINVAL; 909 ssize_t result = -EINVAL;
1030 struct file *file = iocb->ki_filp; 910 struct file *file = iocb->ki_filp;
@@ -1033,9 +913,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1033 struct nfs_direct_req *dreq; 913 struct nfs_direct_req *dreq;
1034 struct nfs_lock_context *l_ctx; 914 struct nfs_lock_context *l_ctx;
1035 loff_t end; 915 loff_t end;
1036 size_t count; 916 size_t count = iov_iter_count(iter);
1037
1038 count = iov_length(iov, nr_segs);
1039 end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 917 end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1040 918
1041 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); 919 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
@@ -1086,7 +964,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1086 if (!is_sync_kiocb(iocb)) 964 if (!is_sync_kiocb(iocb))
1087 dreq->iocb = iocb; 965 dreq->iocb = iocb;
1088 966
1089 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); 967 result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
1090 968
1091 if (mapping->nrpages) { 969 if (mapping->nrpages) {
1092 invalidate_inode_pages2_range(mapping, 970 invalidate_inode_pages2_range(mapping,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c1edf7336315..4042ff58fe3f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -165,22 +165,21 @@ nfs_file_flush(struct file *file, fl_owner_t id)
165EXPORT_SYMBOL_GPL(nfs_file_flush); 165EXPORT_SYMBOL_GPL(nfs_file_flush);
166 166
167ssize_t 167ssize_t
168nfs_file_read(struct kiocb *iocb, const struct iovec *iov, 168nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
169 unsigned long nr_segs, loff_t pos)
170{ 169{
171 struct inode *inode = file_inode(iocb->ki_filp); 170 struct inode *inode = file_inode(iocb->ki_filp);
172 ssize_t result; 171 ssize_t result;
173 172
174 if (iocb->ki_filp->f_flags & O_DIRECT) 173 if (iocb->ki_filp->f_flags & O_DIRECT)
175 return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); 174 return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
176 175
177 dprintk("NFS: read(%pD2, %lu@%lu)\n", 176 dprintk("NFS: read(%pD2, %zu@%lu)\n",
178 iocb->ki_filp, 177 iocb->ki_filp,
179 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); 178 iov_iter_count(to), (unsigned long) iocb->ki_pos);
180 179
181 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 180 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
182 if (!result) { 181 if (!result) {
183 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 182 result = generic_file_read_iter(iocb, to);
184 if (result > 0) 183 if (result > 0)
185 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); 184 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
186 } 185 }
@@ -635,24 +634,24 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
635 return 0; 634 return 0;
636} 635}
637 636
638ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, 637ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
639 unsigned long nr_segs, loff_t pos)
640{ 638{
641 struct file *file = iocb->ki_filp; 639 struct file *file = iocb->ki_filp;
642 struct inode *inode = file_inode(file); 640 struct inode *inode = file_inode(file);
643 unsigned long written = 0; 641 unsigned long written = 0;
644 ssize_t result; 642 ssize_t result;
645 size_t count = iov_length(iov, nr_segs); 643 size_t count = iov_iter_count(from);
644 loff_t pos = iocb->ki_pos;
646 645
647 result = nfs_key_timeout_notify(file, inode); 646 result = nfs_key_timeout_notify(file, inode);
648 if (result) 647 if (result)
649 return result; 648 return result;
650 649
651 if (file->f_flags & O_DIRECT) 650 if (file->f_flags & O_DIRECT)
652 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); 651 return nfs_file_direct_write(iocb, from, pos, true);
653 652
654 dprintk("NFS: write(%pD2, %lu@%Ld)\n", 653 dprintk("NFS: write(%pD2, %zu@%Ld)\n",
655 file, (unsigned long) count, (long long) pos); 654 file, count, (long long) pos);
656 655
657 result = -EBUSY; 656 result = -EBUSY;
658 if (IS_SWAPFILE(inode)) 657 if (IS_SWAPFILE(inode))
@@ -670,7 +669,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
670 if (!count) 669 if (!count)
671 goto out; 670 goto out;
672 671
673 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 672 result = generic_file_write_iter(iocb, from);
674 if (result > 0) 673 if (result > 0)
675 written = result; 674 written = result;
676 675
@@ -691,36 +690,6 @@ out_swapfile:
691} 690}
692EXPORT_SYMBOL_GPL(nfs_file_write); 691EXPORT_SYMBOL_GPL(nfs_file_write);
693 692
694ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
695 struct file *filp, loff_t *ppos,
696 size_t count, unsigned int flags)
697{
698 struct inode *inode = file_inode(filp);
699 unsigned long written = 0;
700 ssize_t ret;
701
702 dprintk("NFS splice_write(%pD2, %lu@%llu)\n",
703 filp, (unsigned long) count, (unsigned long long) *ppos);
704
705 /*
706 * The combination of splice and an O_APPEND destination is disallowed.
707 */
708
709 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
710 if (ret > 0)
711 written = ret;
712
713 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
714 int err = vfs_fsync(filp, 0);
715 if (err < 0)
716 ret = err;
717 }
718 if (ret > 0)
719 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
720 return ret;
721}
722EXPORT_SYMBOL_GPL(nfs_file_splice_write);
723
724static int 693static int
725do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 694do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
726{ 695{
@@ -935,10 +904,10 @@ EXPORT_SYMBOL_GPL(nfs_setlease);
935 904
936const struct file_operations nfs_file_operations = { 905const struct file_operations nfs_file_operations = {
937 .llseek = nfs_file_llseek, 906 .llseek = nfs_file_llseek,
938 .read = do_sync_read, 907 .read = new_sync_read,
939 .write = do_sync_write, 908 .write = new_sync_write,
940 .aio_read = nfs_file_read, 909 .read_iter = nfs_file_read,
941 .aio_write = nfs_file_write, 910 .write_iter = nfs_file_write,
942 .mmap = nfs_file_mmap, 911 .mmap = nfs_file_mmap,
943 .open = nfs_file_open, 912 .open = nfs_file_open,
944 .flush = nfs_file_flush, 913 .flush = nfs_file_flush,
@@ -947,7 +916,7 @@ const struct file_operations nfs_file_operations = {
947 .lock = nfs_lock, 916 .lock = nfs_lock,
948 .flock = nfs_flock, 917 .flock = nfs_flock,
949 .splice_read = nfs_file_splice_read, 918 .splice_read = nfs_file_splice_read,
950 .splice_write = nfs_file_splice_write, 919 .splice_write = iter_file_splice_write,
951 .check_flags = nfs_check_flags, 920 .check_flags = nfs_check_flags,
952 .setlease = nfs_setlease, 921 .setlease = nfs_setlease,
953}; 922};
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c496f8a74639..9927913c97c2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping)
147 return ret; 147 return ret;
148} 148}
149 149
150static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
151{
152 struct nfs_inode *nfsi = NFS_I(inode);
153
154 if (inode->i_mapping->nrpages == 0)
155 flags &= ~NFS_INO_INVALID_DATA;
156 nfsi->cache_validity |= flags;
157 if (flags & NFS_INO_INVALID_DATA)
158 nfs_fscache_invalidate(inode);
159}
160
150/* 161/*
151 * Invalidate the local caches 162 * Invalidate the local caches
152 */ 163 */
@@ -162,17 +173,16 @@ static void nfs_zap_caches_locked(struct inode *inode)
162 173
163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 174 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 175 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
165 nfs_fscache_invalidate(inode); 176 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
166 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
167 | NFS_INO_INVALID_DATA 177 | NFS_INO_INVALID_DATA
168 | NFS_INO_INVALID_ACCESS 178 | NFS_INO_INVALID_ACCESS
169 | NFS_INO_INVALID_ACL 179 | NFS_INO_INVALID_ACL
170 | NFS_INO_REVAL_PAGECACHE; 180 | NFS_INO_REVAL_PAGECACHE);
171 } else 181 } else
172 nfsi->cache_validity |= NFS_INO_INVALID_ATTR 182 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
173 | NFS_INO_INVALID_ACCESS 183 | NFS_INO_INVALID_ACCESS
174 | NFS_INO_INVALID_ACL 184 | NFS_INO_INVALID_ACL
175 | NFS_INO_REVAL_PAGECACHE; 185 | NFS_INO_REVAL_PAGECACHE);
176 nfs_zap_label_cache_locked(nfsi); 186 nfs_zap_label_cache_locked(nfsi);
177} 187}
178 188
@@ -187,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
187{ 197{
188 if (mapping->nrpages != 0) { 198 if (mapping->nrpages != 0) {
189 spin_lock(&inode->i_lock); 199 spin_lock(&inode->i_lock);
190 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 200 nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
191 nfs_fscache_invalidate(inode);
192 spin_unlock(&inode->i_lock); 201 spin_unlock(&inode->i_lock);
193 } 202 }
194} 203}
@@ -209,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
209void nfs_invalidate_atime(struct inode *inode) 218void nfs_invalidate_atime(struct inode *inode)
210{ 219{
211 spin_lock(&inode->i_lock); 220 spin_lock(&inode->i_lock);
212 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 221 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
213 spin_unlock(&inode->i_lock); 222 spin_unlock(&inode->i_lock);
214} 223}
215EXPORT_SYMBOL_GPL(nfs_invalidate_atime); 224EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
@@ -369,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
369 inode->i_mode = fattr->mode; 378 inode->i_mode = fattr->mode;
370 if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 379 if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
371 && nfs_server_capable(inode, NFS_CAP_MODE)) 380 && nfs_server_capable(inode, NFS_CAP_MODE))
372 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 381 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
373 /* Why so? Because we want revalidate for devices/FIFOs, and 382 /* Why so? Because we want revalidate for devices/FIFOs, and
374 * that's precisely what we have in nfs_file_inode_operations. 383 * that's precisely what we have in nfs_file_inode_operations.
375 */ 384 */
@@ -415,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
415 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 424 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
416 inode->i_atime = fattr->atime; 425 inode->i_atime = fattr->atime;
417 else if (nfs_server_capable(inode, NFS_CAP_ATIME)) 426 else if (nfs_server_capable(inode, NFS_CAP_ATIME))
418 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 427 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
419 if (fattr->valid & NFS_ATTR_FATTR_MTIME) 428 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
420 inode->i_mtime = fattr->mtime; 429 inode->i_mtime = fattr->mtime;
421 else if (nfs_server_capable(inode, NFS_CAP_MTIME)) 430 else if (nfs_server_capable(inode, NFS_CAP_MTIME))
422 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 431 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
423 if (fattr->valid & NFS_ATTR_FATTR_CTIME) 432 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
424 inode->i_ctime = fattr->ctime; 433 inode->i_ctime = fattr->ctime;
425 else if (nfs_server_capable(inode, NFS_CAP_CTIME)) 434 else if (nfs_server_capable(inode, NFS_CAP_CTIME))
426 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 435 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
427 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) 436 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
428 inode->i_version = fattr->change_attr; 437 inode->i_version = fattr->change_attr;
429 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) 438 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
430 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 439 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
431 if (fattr->valid & NFS_ATTR_FATTR_SIZE) 440 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
432 inode->i_size = nfs_size_to_loff_t(fattr->size); 441 inode->i_size = nfs_size_to_loff_t(fattr->size);
433 else 442 else
434 nfsi->cache_validity |= NFS_INO_INVALID_ATTR 443 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
435 | NFS_INO_REVAL_PAGECACHE; 444 | NFS_INO_REVAL_PAGECACHE);
436 if (fattr->valid & NFS_ATTR_FATTR_NLINK) 445 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
437 set_nlink(inode, fattr->nlink); 446 set_nlink(inode, fattr->nlink);
438 else if (nfs_server_capable(inode, NFS_CAP_NLINK)) 447 else if (nfs_server_capable(inode, NFS_CAP_NLINK))
439 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 448 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
440 if (fattr->valid & NFS_ATTR_FATTR_OWNER) 449 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
441 inode->i_uid = fattr->uid; 450 inode->i_uid = fattr->uid;
442 else if (nfs_server_capable(inode, NFS_CAP_OWNER)) 451 else if (nfs_server_capable(inode, NFS_CAP_OWNER))
443 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 452 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
444 if (fattr->valid & NFS_ATTR_FATTR_GROUP) 453 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
445 inode->i_gid = fattr->gid; 454 inode->i_gid = fattr->gid;
446 else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) 455 else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
447 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 456 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
448 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) 457 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
449 inode->i_blocks = fattr->du.nfs2.blocks; 458 inode->i_blocks = fattr->du.nfs2.blocks;
450 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 459 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -550,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
550 559
551 spin_lock(&inode->i_lock); 560 spin_lock(&inode->i_lock);
552 i_size_write(inode, offset); 561 i_size_write(inode, offset);
562 /* Optimisation */
563 if (offset == 0)
564 NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
553 spin_unlock(&inode->i_lock); 565 spin_unlock(&inode->i_lock);
554 566
555 truncate_pagecache(inode, offset); 567 truncate_pagecache(inode, offset);
@@ -578,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
578 inode->i_uid = attr->ia_uid; 590 inode->i_uid = attr->ia_uid;
579 if ((attr->ia_valid & ATTR_GID) != 0) 591 if ((attr->ia_valid & ATTR_GID) != 0)
580 inode->i_gid = attr->ia_gid; 592 inode->i_gid = attr->ia_gid;
581 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 593 nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
594 | NFS_INO_INVALID_ACL);
582 spin_unlock(&inode->i_lock); 595 spin_unlock(&inode->i_lock);
583 } 596 }
584 if ((attr->ia_valid & ATTR_SIZE) != 0) { 597 if ((attr->ia_valid & ATTR_SIZE) != 0) {
@@ -1101,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
1101 && inode->i_version == fattr->pre_change_attr) { 1114 && inode->i_version == fattr->pre_change_attr) {
1102 inode->i_version = fattr->change_attr; 1115 inode->i_version = fattr->change_attr;
1103 if (S_ISDIR(inode->i_mode)) 1116 if (S_ISDIR(inode->i_mode))
1104 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 1117 nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
1105 ret |= NFS_INO_INVALID_ATTR; 1118 ret |= NFS_INO_INVALID_ATTR;
1106 } 1119 }
1107 /* If we have atomic WCC data, we may update some attributes */ 1120 /* If we have atomic WCC data, we may update some attributes */
@@ -1117,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
1117 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 1130 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
1118 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1131 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1119 if (S_ISDIR(inode->i_mode)) 1132 if (S_ISDIR(inode->i_mode))
1120 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 1133 nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
1121 ret |= NFS_INO_INVALID_ATTR; 1134 ret |= NFS_INO_INVALID_ATTR;
1122 } 1135 }
1123 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 1136 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
@@ -1128,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
1128 ret |= NFS_INO_INVALID_ATTR; 1141 ret |= NFS_INO_INVALID_ATTR;
1129 } 1142 }
1130 1143
1131 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
1132 nfs_fscache_invalidate(inode);
1133
1134 return ret; 1144 return ret;
1135} 1145}
1136 1146
@@ -1189,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1189 invalid |= NFS_INO_INVALID_ATIME; 1199 invalid |= NFS_INO_INVALID_ATIME;
1190 1200
1191 if (invalid != 0) 1201 if (invalid != 0)
1192 nfsi->cache_validity |= invalid; 1202 nfs_set_cache_invalid(inode, invalid);
1193 1203
1194 nfsi->read_cache_jiffies = fattr->time_start; 1204 nfsi->read_cache_jiffies = fattr->time_start;
1195 return 0; 1205 return 0;
@@ -1402,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
1402 1412
1403static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) 1413static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
1404{ 1414{
1405 struct nfs_inode *nfsi = NFS_I(inode); 1415 unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1406 1416
1407 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1417 if (S_ISDIR(inode->i_mode))
1408 if (S_ISDIR(inode->i_mode)) { 1418 invalid |= NFS_INO_INVALID_DATA;
1409 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 1419 nfs_set_cache_invalid(inode, invalid);
1410 nfs_fscache_invalidate(inode);
1411 }
1412 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 1420 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1413 return 0; 1421 return 0;
1414 return nfs_refresh_inode_locked(inode, fattr); 1422 return nfs_refresh_inode_locked(inode, fattr);
@@ -1601,6 +1609,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1601 if ((nfsi->npages == 0) || new_isize > cur_isize) { 1609 if ((nfsi->npages == 0) || new_isize > cur_isize) {
1602 i_size_write(inode, new_isize); 1610 i_size_write(inode, new_isize);
1603 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1611 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1612 invalid &= ~NFS_INO_REVAL_PAGECACHE;
1604 } 1613 }
1605 dprintk("NFS: isize change on server for file %s/%ld " 1614 dprintk("NFS: isize change on server for file %s/%ld "
1606 "(%Ld to %Ld)\n", 1615 "(%Ld to %Ld)\n",
@@ -1702,10 +1711,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1702 invalid &= ~NFS_INO_INVALID_DATA; 1711 invalid &= ~NFS_INO_INVALID_DATA;
1703 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || 1712 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) ||
1704 (save_cache_validity & NFS_INO_REVAL_FORCED)) 1713 (save_cache_validity & NFS_INO_REVAL_FORCED))
1705 nfsi->cache_validity |= invalid; 1714 nfs_set_cache_invalid(inode, invalid);
1706
1707 if (invalid & NFS_INO_INVALID_DATA)
1708 nfs_fscache_invalidate(inode);
1709 1715
1710 return 0; 1716 return 0;
1711 out_err: 1717 out_err:
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8b69cba1bb04..82ddbf46660e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -327,16 +327,14 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
327int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); 327int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
328loff_t nfs_file_llseek(struct file *, loff_t, int); 328loff_t nfs_file_llseek(struct file *, loff_t, int);
329int nfs_file_flush(struct file *, fl_owner_t); 329int nfs_file_flush(struct file *, fl_owner_t);
330ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); 330ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
331ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, 331ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
332 size_t, unsigned int); 332 size_t, unsigned int);
333int nfs_file_mmap(struct file *, struct vm_area_struct *); 333int nfs_file_mmap(struct file *, struct vm_area_struct *);
334ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); 334ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
335int nfs_file_release(struct inode *, struct file *); 335int nfs_file_release(struct inode *, struct file *);
336int nfs_lock(struct file *, int, struct file_lock *); 336int nfs_lock(struct file *, int, struct file_lock *);
337int nfs_flock(struct file *, int, struct file_lock *); 337int nfs_flock(struct file *, int, struct file_lock *);
338ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
339 size_t, unsigned int);
340int nfs_check_flags(int); 338int nfs_check_flags(int);
341int nfs_setlease(struct file *, long, struct file_lock **); 339int nfs_setlease(struct file *, long, struct file_lock **);
342 340
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index f63cb87cd730..ba2affa51941 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -230,7 +230,7 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
230extern struct file_system_type nfs4_fs_type; 230extern struct file_system_type nfs4_fs_type;
231 231
232/* nfs4namespace.c */ 232/* nfs4namespace.c */
233struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); 233struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *);
234struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, 234struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
235 struct nfs_fh *, struct nfs_fattr *); 235 struct nfs_fh *, struct nfs_fattr *);
236int nfs4_replace_transport(struct nfs_server *server, 236int nfs4_replace_transport(struct nfs_server *server,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 464db9dd6318..a816f0627a6c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -117,10 +117,10 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
117 117
118const struct file_operations nfs4_file_operations = { 118const struct file_operations nfs4_file_operations = {
119 .llseek = nfs_file_llseek, 119 .llseek = nfs_file_llseek,
120 .read = do_sync_read, 120 .read = new_sync_read,
121 .write = do_sync_write, 121 .write = new_sync_write,
122 .aio_read = nfs_file_read, 122 .read_iter = nfs_file_read,
123 .aio_write = nfs_file_write, 123 .write_iter = nfs_file_write,
124 .mmap = nfs_file_mmap, 124 .mmap = nfs_file_mmap,
125 .open = nfs4_file_open, 125 .open = nfs4_file_open,
126 .flush = nfs_file_flush, 126 .flush = nfs_file_flush,
@@ -129,7 +129,7 @@ const struct file_operations nfs4_file_operations = {
129 .lock = nfs_lock, 129 .lock = nfs_lock,
130 .flock = nfs_flock, 130 .flock = nfs_flock,
131 .splice_read = nfs_file_splice_read, 131 .splice_read = nfs_file_splice_read,
132 .splice_write = nfs_file_splice_write, 132 .splice_write = iter_file_splice_write,
133 .check_flags = nfs_check_flags, 133 .check_flags = nfs_check_flags,
134 .setlease = nfs_setlease, 134 .setlease = nfs_setlease,
135}; 135};
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3d5dbf80d46a..3d83cb1fdc70 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -139,16 +139,22 @@ static size_t nfs_parse_server_name(char *string, size_t len,
139 * @server: NFS server struct 139 * @server: NFS server struct
140 * @flavors: List of security tuples returned by SECINFO procedure 140 * @flavors: List of security tuples returned by SECINFO procedure
141 * 141 *
142 * Return the pseudoflavor of the first security mechanism in 142 * Return an rpc client that uses the first security mechanism in
143 * "flavors" that is locally supported. Return RPC_AUTH_UNIX if 143 * "flavors" that is locally supported. The "flavors" array
144 * no matching flavor is found in the array. The "flavors" array
145 * is searched in the order returned from the server, per RFC 3530 144 * is searched in the order returned from the server, per RFC 3530
146 * recommendation. 145 * recommendation and each flavor is checked for membership in the
146 * sec= mount option list if it exists.
147 *
148 * Return -EPERM if no matching flavor is found in the array.
149 *
150 * Please call rpc_shutdown_client() when you are done with this rpc client.
151 *
147 */ 152 */
148static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, 153static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
154 struct nfs_server *server,
149 struct nfs4_secinfo_flavors *flavors) 155 struct nfs4_secinfo_flavors *flavors)
150{ 156{
151 rpc_authflavor_t pseudoflavor; 157 rpc_authflavor_t pflavor;
152 struct nfs4_secinfo4 *secinfo; 158 struct nfs4_secinfo4 *secinfo;
153 unsigned int i; 159 unsigned int i;
154 160
@@ -159,62 +165,73 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server,
159 case RPC_AUTH_NULL: 165 case RPC_AUTH_NULL:
160 case RPC_AUTH_UNIX: 166 case RPC_AUTH_UNIX:
161 case RPC_AUTH_GSS: 167 case RPC_AUTH_GSS:
162 pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, 168 pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
163 &secinfo->flavor_info); 169 &secinfo->flavor_info);
164 /* make sure pseudoflavor matches sec= mount opt */ 170 /* does the pseudoflavor match a sec= mount opt? */
165 if (pseudoflavor != RPC_AUTH_MAXFLAVOR && 171 if (pflavor != RPC_AUTH_MAXFLAVOR &&
166 nfs_auth_info_match(&server->auth_info, 172 nfs_auth_info_match(&server->auth_info, pflavor)) {
167 pseudoflavor)) 173 struct rpc_clnt *new;
168 return pseudoflavor; 174 struct rpc_cred *cred;
169 break; 175
176 /* Cloning creates an rpc_auth for the flavor */
177 new = rpc_clone_client_set_auth(clnt, pflavor);
178 if (IS_ERR(new))
179 continue;
180 /**
181 * Check that the user actually can use the
182 * flavor. This is mostly for RPC_AUTH_GSS
183 * where cr_init obtains a gss context
184 */
185 cred = rpcauth_lookupcred(new->cl_auth, 0);
186 if (IS_ERR(cred)) {
187 rpc_shutdown_client(new);
188 continue;
189 }
190 put_rpccred(cred);
191 return new;
192 }
170 } 193 }
171 } 194 }
172 195 return ERR_PTR(-EPERM);
173 /* if there were any sec= options then nothing matched */
174 if (server->auth_info.flavor_len > 0)
175 return -EPERM;
176
177 return RPC_AUTH_UNIX;
178} 196}
179 197
180static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) 198/**
199 * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup,
200 * return an rpc_clnt that uses the best available security flavor with
201 * respect to the secinfo flavor list and the sec= mount options.
202 *
203 * @clnt: RPC client to clone
204 * @inode: directory inode
205 * @name: lookup name
206 *
207 * Please call rpc_shutdown_client() when you are done with this rpc client.
208 */
209struct rpc_clnt *
210nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
211 struct qstr *name)
181{ 212{
182 struct page *page; 213 struct page *page;
183 struct nfs4_secinfo_flavors *flavors; 214 struct nfs4_secinfo_flavors *flavors;
184 rpc_authflavor_t flavor; 215 struct rpc_clnt *new;
185 int err; 216 int err;
186 217
187 page = alloc_page(GFP_KERNEL); 218 page = alloc_page(GFP_KERNEL);
188 if (!page) 219 if (!page)
189 return -ENOMEM; 220 return ERR_PTR(-ENOMEM);
221
190 flavors = page_address(page); 222 flavors = page_address(page);
191 223
192 err = nfs4_proc_secinfo(inode, name, flavors); 224 err = nfs4_proc_secinfo(inode, name, flavors);
193 if (err < 0) { 225 if (err < 0) {
194 flavor = err; 226 new = ERR_PTR(err);
195 goto out; 227 goto out;
196 } 228 }
197 229
198 flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors); 230 new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);
199 231
200out: 232out:
201 put_page(page); 233 put_page(page);
202 return flavor; 234 return new;
203}
204
205/*
206 * Please call rpc_shutdown_client() when you are done with this client.
207 */
208struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
209 struct qstr *name)
210{
211 rpc_authflavor_t flavor;
212
213 flavor = nfs4_negotiate_security(inode, name);
214 if ((int)flavor < 0)
215 return ERR_PTR((int)flavor);
216
217 return rpc_clone_client_set_auth(clnt, flavor);
218} 235}
219 236
220static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, 237static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
@@ -397,11 +414,6 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
397 414
398 if (client->cl_auth->au_flavor != flavor) 415 if (client->cl_auth->au_flavor != flavor)
399 flavor = client->cl_auth->au_flavor; 416 flavor = client->cl_auth->au_flavor;
400 else {
401 rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
402 if ((int)new >= 0)
403 flavor = new;
404 }
405 mnt = nfs_do_submount(dentry, fh, fattr, flavor); 417 mnt = nfs_do_submount(dentry, fh, fattr, flavor);
406out: 418out:
407 rpc_shutdown_client(client); 419 rpc_shutdown_client(client);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 285ad5334018..4bf3d97cc5a0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3247,7 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
3247 err = -EPERM; 3247 err = -EPERM;
3248 if (client != *clnt) 3248 if (client != *clnt)
3249 goto out; 3249 goto out;
3250 client = nfs4_create_sec_client(client, dir, name); 3250 client = nfs4_negotiate_security(client, dir, name);
3251 if (IS_ERR(client)) 3251 if (IS_ERR(client))
3252 return PTR_ERR(client); 3252 return PTR_ERR(client);
3253 3253
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3ee5af4e738e..98ff061ccaf3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -934,12 +934,14 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
934 934
935 if (nfs_have_delegated_attributes(inode)) 935 if (nfs_have_delegated_attributes(inode))
936 goto out; 936 goto out;
937 if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) 937 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
938 return false; 938 return false;
939 smp_rmb(); 939 smp_rmb();
940 if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) 940 if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
941 return false; 941 return false;
942out: 942out:
943 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
944 return false;
943 return PageUptodate(page) != 0; 945 return PageUptodate(page) != 0;
944} 946}
945 947
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6851b003f2a4..8f029db5d271 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -617,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
617 617
618 switch (create->cr_type) { 618 switch (create->cr_type) {
619 case NF4LNK: 619 case NF4LNK:
620 /* ugh! we have to null-terminate the linktext, or
621 * vfs_symlink() will choke. it is always safe to
622 * null-terminate by brute force, since at worst we
623 * will overwrite the first byte of the create namelen
624 * in the XDR buffer, which has already been extracted
625 * during XDR decode.
626 */
627 create->cr_linkname[create->cr_linklen] = 0;
628
629 status = nfsd_symlink(rqstp, &cstate->current_fh, 620 status = nfsd_symlink(rqstp, &cstate->current_fh,
630 create->cr_name, create->cr_namelen, 621 create->cr_name, create->cr_namelen,
631 create->cr_linkname, create->cr_linklen, 622 create->cr_linkname, create->cr_linklen,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c0d45cec9958..2204e1fe5725 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
41#include <linux/ratelimit.h> 41#include <linux/ratelimit.h>
42#include <linux/sunrpc/svcauth_gss.h> 42#include <linux/sunrpc/svcauth_gss.h>
43#include <linux/sunrpc/addr.h> 43#include <linux/sunrpc/addr.h>
44#include <linux/hash.h>
44#include "xdr4.h" 45#include "xdr4.h"
45#include "xdr4cb.h" 46#include "xdr4cb.h"
46#include "vfs.h" 47#include "vfs.h"
@@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
364 return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); 365 return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
365} 366}
366 367
368/*
369 * When we recall a delegation, we should be careful not to hand it
370 * out again straight away.
371 * To ensure this we keep a pair of bloom filters ('new' and 'old')
372 * in which the filehandles of recalled delegations are "stored".
373 * If a filehandle appears in either filter, a delegation is blocked.
374 * When a delegation is recalled, the filehandle is stored in the "new"
375 * filter.
376 * Every 30 seconds we swap the filters and clear the "new" one,
377 * unless both are empty of course.
378 *
379 * Each filter is 256 bits. We hash the filehandle to 32bit and use the
380 * low 3 bytes as hash-table indices.
381 *
382 * 'state_lock', which is always held when block_delegations() is called,
383 * is used to manage concurrent access. Testing does not need the lock
384 * except when swapping the two filters.
385 */
386static struct bloom_pair {
387 int entries, old_entries;
388 time_t swap_time;
389 int new; /* index into 'set' */
390 DECLARE_BITMAP(set[2], 256);
391} blocked_delegations;
392
393static int delegation_blocked(struct knfsd_fh *fh)
394{
395 u32 hash;
396 struct bloom_pair *bd = &blocked_delegations;
397
398 if (bd->entries == 0)
399 return 0;
400 if (seconds_since_boot() - bd->swap_time > 30) {
401 spin_lock(&state_lock);
402 if (seconds_since_boot() - bd->swap_time > 30) {
403 bd->entries -= bd->old_entries;
404 bd->old_entries = bd->entries;
405 memset(bd->set[bd->new], 0,
406 sizeof(bd->set[0]));
407 bd->new = 1-bd->new;
408 bd->swap_time = seconds_since_boot();
409 }
410 spin_unlock(&state_lock);
411 }
412 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
413 if (test_bit(hash&255, bd->set[0]) &&
414 test_bit((hash>>8)&255, bd->set[0]) &&
415 test_bit((hash>>16)&255, bd->set[0]))
416 return 1;
417
418 if (test_bit(hash&255, bd->set[1]) &&
419 test_bit((hash>>8)&255, bd->set[1]) &&
420 test_bit((hash>>16)&255, bd->set[1]))
421 return 1;
422
423 return 0;
424}
425
426static void block_delegations(struct knfsd_fh *fh)
427{
428 u32 hash;
429 struct bloom_pair *bd = &blocked_delegations;
430
431 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
432
433 __set_bit(hash&255, bd->set[bd->new]);
434 __set_bit((hash>>8)&255, bd->set[bd->new]);
435 __set_bit((hash>>16)&255, bd->set[bd->new]);
436 if (bd->entries == 0)
437 bd->swap_time = seconds_since_boot();
438 bd->entries += 1;
439}
440
367static struct nfs4_delegation * 441static struct nfs4_delegation *
368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) 442alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
369{ 443{
@@ -372,6 +446,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
372 dprintk("NFSD alloc_init_deleg\n"); 446 dprintk("NFSD alloc_init_deleg\n");
373 if (num_delegations > max_delegations) 447 if (num_delegations > max_delegations)
374 return NULL; 448 return NULL;
449 if (delegation_blocked(&current_fh->fh_handle))
450 return NULL;
375 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); 451 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
376 if (dp == NULL) 452 if (dp == NULL)
377 return dp; 453 return dp;
@@ -2770,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2770 /* Only place dl_time is set; protected by i_lock: */ 2846 /* Only place dl_time is set; protected by i_lock: */
2771 dp->dl_time = get_seconds(); 2847 dp->dl_time = get_seconds();
2772 2848
2849 block_delegations(&dp->dl_fh);
2850
2773 nfsd4_cb_recall(dp); 2851 nfsd4_cb_recall(dp);
2774} 2852}
2775 2853
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2d305a121f37..2fc7abebeb9b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -600,7 +600,18 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
600 READ_BUF(4); 600 READ_BUF(4);
601 create->cr_linklen = be32_to_cpup(p++); 601 create->cr_linklen = be32_to_cpup(p++);
602 READ_BUF(create->cr_linklen); 602 READ_BUF(create->cr_linklen);
603 SAVEMEM(create->cr_linkname, create->cr_linklen); 603 /*
604 * The VFS will want a null-terminated string, and
605 * null-terminating in place isn't safe since this might
606 * end on a page boundary:
607 */
608 create->cr_linkname =
609 kmalloc(create->cr_linklen + 1, GFP_KERNEL);
610 if (!create->cr_linkname)
611 return nfserr_jukebox;
612 memcpy(create->cr_linkname, p, create->cr_linklen);
613 create->cr_linkname[create->cr_linklen] = '\0';
614 defer_free(argp, kfree, create->cr_linkname);
604 break; 615 break;
605 case NF4BLK: 616 case NF4BLK:
606 case NF4CHR: 617 case NF4CHR:
@@ -2687,6 +2698,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2687 nfserr = nfserr_toosmall; 2698 nfserr = nfserr_toosmall;
2688 goto fail; 2699 goto fail;
2689 case nfserr_noent: 2700 case nfserr_noent:
2701 xdr_truncate_encode(xdr, start_offset);
2690 goto skip_entry; 2702 goto skip_entry;
2691 default: 2703 default:
2692 /* 2704 /*
@@ -3266,7 +3278,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
3266 3278
3267 wire_count = htonl(maxcount); 3279 wire_count = htonl(maxcount);
3268 write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); 3280 write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
3269 xdr_truncate_encode(xdr, length_offset + 4 + maxcount); 3281 xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
3270 if (maxcount & 3) 3282 if (maxcount & 3)
3271 write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, 3283 write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
3272 &zero, 4 - (maxcount&3)); 3284 &zero, 4 - (maxcount&3));
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index f3a82fbcae02..24978153c0c4 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -152,10 +152,10 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
152 */ 152 */
153const struct file_operations nilfs_file_operations = { 153const struct file_operations nilfs_file_operations = {
154 .llseek = generic_file_llseek, 154 .llseek = generic_file_llseek,
155 .read = do_sync_read, 155 .read = new_sync_read,
156 .write = do_sync_write, 156 .write = new_sync_write,
157 .aio_read = generic_file_aio_read, 157 .read_iter = generic_file_read_iter,
158 .aio_write = generic_file_aio_write, 158 .write_iter = generic_file_write_iter,
159 .unlocked_ioctl = nilfs_ioctl, 159 .unlocked_ioctl = nilfs_ioctl,
160#ifdef CONFIG_COMPAT 160#ifdef CONFIG_COMPAT
161 .compat_ioctl = nilfs_compat_ioctl, 161 .compat_ioctl = nilfs_compat_ioctl,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b9c5726120e3..6252b173a465 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -298,19 +298,20 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
298} 298}
299 299
300static ssize_t 300static ssize_t
301nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 301nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
302 loff_t offset, unsigned long nr_segs) 302 loff_t offset)
303{ 303{
304 struct file *file = iocb->ki_filp; 304 struct file *file = iocb->ki_filp;
305 struct address_space *mapping = file->f_mapping; 305 struct address_space *mapping = file->f_mapping;
306 struct inode *inode = file->f_mapping->host; 306 struct inode *inode = file->f_mapping->host;
307 size_t count = iov_iter_count(iter);
307 ssize_t size; 308 ssize_t size;
308 309
309 if (rw == WRITE) 310 if (rw == WRITE)
310 return 0; 311 return 0;
311 312
312 /* Needs synchronization with the cleaner */ 313 /* Needs synchronization with the cleaner */
313 size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 314 size = blockdev_direct_IO(rw, iocb, inode, iter, offset,
314 nilfs_get_block); 315 nilfs_get_block);
315 316
316 /* 317 /*
@@ -319,7 +320,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
319 */ 320 */
320 if (unlikely((rw & WRITE) && size < 0)) { 321 if (unlikely((rw & WRITE) && size < 0)) {
321 loff_t isize = i_size_read(inode); 322 loff_t isize = i_size_read(inode);
322 loff_t end = offset + iov_length(iov, nr_segs); 323 loff_t end = offset + count;
323 324
324 if (end > isize) 325 if (end > isize)
325 nilfs_write_failed(mapping, end); 326 nilfs_write_failed(mapping, end);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 86ddab916b66..5c9e2c81cb11 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2090,10 +2090,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2090 size_t count; /* after file limit checks */ 2090 size_t count; /* after file limit checks */
2091 ssize_t written, err; 2091 ssize_t written, err;
2092 2092
2093 count = 0; 2093 count = iov_length(iov, nr_segs);
2094 err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
2095 if (err)
2096 return err;
2097 pos = *ppos; 2094 pos = *ppos;
2098 /* We can write back this queue in page reclaim. */ 2095 /* We can write back this queue in page reclaim. */
2099 current->backing_dev_info = mapping->backing_dev_info; 2096 current->backing_dev_info = mapping->backing_dev_info;
@@ -2202,8 +2199,8 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
2202 2199
2203const struct file_operations ntfs_file_ops = { 2200const struct file_operations ntfs_file_ops = {
2204 .llseek = generic_file_llseek, /* Seek inside file. */ 2201 .llseek = generic_file_llseek, /* Seek inside file. */
2205 .read = do_sync_read, /* Read from file. */ 2202 .read = new_sync_read, /* Read from file. */
2206 .aio_read = generic_file_aio_read, /* Async read from file. */ 2203 .read_iter = generic_file_read_iter, /* Async read from file. */
2207#ifdef NTFS_RW 2204#ifdef NTFS_RW
2208 .write = do_sync_write, /* Write to file. */ 2205 .write = do_sync_write, /* Write to file. */
2209 .aio_write = ntfs_file_aio_write, /* Async write to file. */ 2206 .aio_write = ntfs_file_aio_write, /* Async write to file. */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index d310d12a9adc..4a231a166cf8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -599,9 +599,8 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
599 599
600static ssize_t ocfs2_direct_IO(int rw, 600static ssize_t ocfs2_direct_IO(int rw,
601 struct kiocb *iocb, 601 struct kiocb *iocb,
602 const struct iovec *iov, 602 struct iov_iter *iter,
603 loff_t offset, 603 loff_t offset)
604 unsigned long nr_segs)
605{ 604{
606 struct file *file = iocb->ki_filp; 605 struct file *file = iocb->ki_filp;
607 struct inode *inode = file_inode(file)->i_mapping->host; 606 struct inode *inode = file_inode(file)->i_mapping->host;
@@ -618,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw,
618 return 0; 617 return 0;
619 618
620 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 619 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
621 iov, offset, nr_segs, 620 iter, offset,
622 ocfs2_direct_IO_get_blocks, 621 ocfs2_direct_IO_get_blocks,
623 ocfs2_dio_end_io, NULL, 0); 622 ocfs2_dio_end_io, NULL, 0);
624} 623}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index a106b3f2b22a..fae17c640df3 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -331,6 +331,7 @@ struct dlm_lock_resource
331 u16 state; 331 u16 state;
332 char lvb[DLM_LVB_LEN]; 332 char lvb[DLM_LVB_LEN];
333 unsigned int inflight_locks; 333 unsigned int inflight_locks;
334 unsigned int inflight_assert_workers;
334 unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 335 unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
335}; 336};
336 337
@@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
910void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 911void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
911 struct dlm_lock_resource *res); 912 struct dlm_lock_resource *res);
912 913
914void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
915 struct dlm_lock_resource *res);
916
913void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 917void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
914void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 918void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
915void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 919void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3087a21d32f9..82abf0cc9a12 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
581 atomic_set(&res->asts_reserved, 0); 581 atomic_set(&res->asts_reserved, 0);
582 res->migration_pending = 0; 582 res->migration_pending = 0;
583 res->inflight_locks = 0; 583 res->inflight_locks = 0;
584 res->inflight_assert_workers = 0;
584 585
585 res->dlm = dlm; 586 res->dlm = dlm;
586 587
@@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
683 wake_up(&res->wq); 684 wake_up(&res->wq);
684} 685}
685 686
687void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
688 struct dlm_lock_resource *res)
689{
690 assert_spin_locked(&res->spinlock);
691 res->inflight_assert_workers++;
692 mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
693 dlm->name, res->lockname.len, res->lockname.name,
694 res->inflight_assert_workers);
695}
696
697static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
698 struct dlm_lock_resource *res)
699{
700 spin_lock(&res->spinlock);
701 __dlm_lockres_grab_inflight_worker(dlm, res);
702 spin_unlock(&res->spinlock);
703}
704
705static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
706 struct dlm_lock_resource *res)
707{
708 assert_spin_locked(&res->spinlock);
709 BUG_ON(res->inflight_assert_workers == 0);
710 res->inflight_assert_workers--;
711 mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
712 dlm->name, res->lockname.len, res->lockname.name,
713 res->inflight_assert_workers);
714}
715
716static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
717 struct dlm_lock_resource *res)
718{
719 spin_lock(&res->spinlock);
720 __dlm_lockres_drop_inflight_worker(dlm, res);
721 spin_unlock(&res->spinlock);
722}
723
686/* 724/*
687 * lookup a lock resource by name. 725 * lookup a lock resource by name.
688 * may already exist in the hashtable. 726 * may already exist in the hashtable.
@@ -1603,7 +1641,8 @@ send_response:
1603 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1641 mlog(ML_ERROR, "failed to dispatch assert master work\n");
1604 response = DLM_MASTER_RESP_ERROR; 1642 response = DLM_MASTER_RESP_ERROR;
1605 dlm_lockres_put(res); 1643 dlm_lockres_put(res);
1606 } 1644 } else
1645 dlm_lockres_grab_inflight_worker(dlm, res);
1607 } else { 1646 } else {
1608 if (res) 1647 if (res)
1609 dlm_lockres_put(res); 1648 dlm_lockres_put(res);
@@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2118 dlm_lockres_release_ast(dlm, res); 2157 dlm_lockres_release_ast(dlm, res);
2119 2158
2120put: 2159put:
2160 dlm_lockres_drop_inflight_worker(dlm, res);
2161
2121 dlm_lockres_put(res); 2162 dlm_lockres_put(res);
2122 2163
2123 mlog(0, "finished with dlm_assert_master_worker\n"); 2164 mlog(0, "finished with dlm_assert_master_worker\n");
@@ -3088,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3088 /* remove it so that only one mle will be found */ 3129 /* remove it so that only one mle will be found */
3089 __dlm_unlink_mle(dlm, tmp); 3130 __dlm_unlink_mle(dlm, tmp);
3090 __dlm_mle_detach_hb_events(dlm, tmp); 3131 __dlm_mle_detach_hb_events(dlm, tmp);
3091 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3132 if (tmp->type == DLM_MLE_MASTER) {
3092 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3133 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3093 "telling master to get ref for cleared out mle " 3134 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3094 "during migration\n", dlm->name, namelen, name, 3135 "telling master to get ref "
3095 master, new_master); 3136 "for cleared out mle during "
3137 "migration\n", dlm->name,
3138 namelen, name, master,
3139 new_master);
3140 }
3096 } 3141 }
3097 spin_unlock(&tmp->spinlock); 3142 spin_unlock(&tmp->spinlock);
3098 } 3143 }
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 5de019437ea5..45067faf5695 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
1708 mlog_errno(-ENOMEM); 1708 mlog_errno(-ENOMEM);
1709 /* retry!? */ 1709 /* retry!? */
1710 BUG(); 1710 BUG();
1711 } 1711 } else
1712 __dlm_lockres_grab_inflight_worker(dlm, res);
1712 } else /* put.. incase we are not the master */ 1713 } else /* put.. incase we are not the master */
1713 dlm_lockres_put(res); 1714 dlm_lockres_put(res);
1714 spin_unlock(&res->spinlock); 1715 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 9db869de829d..69aac6f088ad 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
259 * refs on it. */ 259 * refs on it. */
260 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
261 if (!unused || 261 if (!unused ||
262 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING) ||
263 (lockres->inflight_assert_workers != 0)) {
263 mlog(0, "%s: res %.*s is in use or being remastered, " 264 mlog(0, "%s: res %.*s is in use or being remastered, "
264 "used %d, state %d\n", dlm->name, 265 "used %d, state %d, assert master workers %u\n",
265 lockres->lockname.len, lockres->lockname.name, 266 dlm->name, lockres->lockname.len,
266 !unused, lockres->state); 267 lockres->lockname.name,
267 list_move_tail(&dlm->purge_list, &lockres->purge); 268 !unused, lockres->state,
269 lockres->inflight_assert_workers);
270 list_move_tail(&lockres->purge, &dlm->purge_list);
268 spin_unlock(&lockres->spinlock); 271 spin_unlock(&lockres->spinlock);
269 continue; 272 continue;
270 } 273 }
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 5698b52cf5c9..2e3c9dbab68c 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
191 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 191 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
192 } else if (status == DLM_RECOVERING || 192 } else if (status == DLM_RECOVERING ||
193 status == DLM_MIGRATING || 193 status == DLM_MIGRATING ||
194 status == DLM_FORWARD) { 194 status == DLM_FORWARD ||
195 status == DLM_NOLOCKMGR
196 ) {
195 /* must clear the actions because this unlock 197 /* must clear the actions because this unlock
196 * is about to be retried. cannot free or do 198 * is about to be retried. cannot free or do
197 * any list manipulation. */ 199 * any list manipulation. */
@@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
200 res->lockname.name, 202 res->lockname.name,
201 status==DLM_RECOVERING?"recovering": 203 status==DLM_RECOVERING?"recovering":
202 (status==DLM_MIGRATING?"migrating": 204 (status==DLM_MIGRATING?"migrating":
203 "forward")); 205 (status == DLM_FORWARD ? "forward" :
206 "nolockmanager")));
204 actions = 0; 207 actions = 0;
205 } 208 }
206 if (flags & LKM_CANCEL) 209 if (flags & LKM_CANCEL)
@@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
364 * updated state to the recovery master. this thread 367 * updated state to the recovery master. this thread
365 * just needs to finish out the operation and call 368 * just needs to finish out the operation and call
366 * the unlockast. */ 369 * the unlockast. */
367 ret = DLM_NORMAL; 370 if (dlm_is_node_dead(dlm, owner))
371 ret = DLM_NORMAL;
372 else
373 ret = DLM_NOLOCKMGR;
368 } else { 374 } else {
369 /* something bad. this will BUG in ocfs2 */ 375 /* something bad. this will BUG in ocfs2 */
370 ret = dlm_err_to_dlm_status(tmpret); 376 ret = dlm_err_to_dlm_status(tmpret);
@@ -638,7 +644,9 @@ retry:
638 644
639 if (status == DLM_RECOVERING || 645 if (status == DLM_RECOVERING ||
640 status == DLM_MIGRATING || 646 status == DLM_MIGRATING ||
641 status == DLM_FORWARD) { 647 status == DLM_FORWARD ||
648 status == DLM_NOLOCKMGR) {
649
642 /* We want to go away for a tiny bit to allow recovery 650 /* We want to go away for a tiny bit to allow recovery
643 * / migration to complete on this resource. I don't 651 * / migration to complete on this resource. I don't
644 * know of any wait queue we could sleep on as this 652 * know of any wait queue we could sleep on as this
@@ -650,7 +658,7 @@ retry:
650 msleep(50); 658 msleep(50);
651 659
652 mlog(0, "retrying unlock due to pending recovery/" 660 mlog(0, "retrying unlock due to pending recovery/"
653 "migration/in-progress\n"); 661 "migration/in-progress/reconnect\n");
654 goto retry; 662 goto retry;
655 } 663 }
656 664
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8eb6e5732d3b..2930e231f3f9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2233,16 +2233,13 @@ out:
2233 return ret; 2233 return ret;
2234} 2234}
2235 2235
2236static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2236static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2237 const struct iovec *iov, 2237 struct iov_iter *from)
2238 unsigned long nr_segs,
2239 loff_t pos)
2240{ 2238{
2241 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2239 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
2242 int can_do_direct, has_refcount = 0; 2240 int can_do_direct, has_refcount = 0;
2243 ssize_t written = 0; 2241 ssize_t written = 0;
2244 size_t ocount; /* original count */ 2242 size_t count = iov_iter_count(from);
2245 size_t count; /* after file limit checks */
2246 loff_t old_size, *ppos = &iocb->ki_pos; 2243 loff_t old_size, *ppos = &iocb->ki_pos;
2247 u32 old_clusters; 2244 u32 old_clusters;
2248 struct file *file = iocb->ki_filp; 2245 struct file *file = iocb->ki_filp;
@@ -2256,7 +2253,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2256 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2253 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2257 file->f_path.dentry->d_name.len, 2254 file->f_path.dentry->d_name.len,
2258 file->f_path.dentry->d_name.name, 2255 file->f_path.dentry->d_name.name,
2259 (unsigned int)nr_segs); 2256 (unsigned int)from->nr_segs); /* GRRRRR */
2260 2257
2261 if (iocb->ki_nbytes == 0) 2258 if (iocb->ki_nbytes == 0)
2262 return 0; 2259 return 0;
@@ -2354,29 +2351,21 @@ relock:
2354 /* communicate with ocfs2_dio_end_io */ 2351 /* communicate with ocfs2_dio_end_io */
2355 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2352 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2356 2353
2357 ret = generic_segment_checks(iov, &nr_segs, &ocount,
2358 VERIFY_READ);
2359 if (ret)
2360 goto out_dio;
2361
2362 count = ocount;
2363 ret = generic_write_checks(file, ppos, &count, 2354 ret = generic_write_checks(file, ppos, &count,
2364 S_ISBLK(inode->i_mode)); 2355 S_ISBLK(inode->i_mode));
2365 if (ret) 2356 if (ret)
2366 goto out_dio; 2357 goto out_dio;
2367 2358
2359 iov_iter_truncate(from, count);
2368 if (direct_io) { 2360 if (direct_io) {
2369 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2361 written = generic_file_direct_write(iocb, from, *ppos);
2370 count, ocount);
2371 if (written < 0) { 2362 if (written < 0) {
2372 ret = written; 2363 ret = written;
2373 goto out_dio; 2364 goto out_dio;
2374 } 2365 }
2375 } else { 2366 } else {
2376 struct iov_iter from;
2377 iov_iter_init(&from, iov, nr_segs, count, 0);
2378 current->backing_dev_info = file->f_mapping->backing_dev_info; 2367 current->backing_dev_info = file->f_mapping->backing_dev_info;
2379 written = generic_perform_write(file, &from, *ppos); 2368 written = generic_perform_write(file, from, *ppos);
2380 if (likely(written >= 0)) 2369 if (likely(written >= 0))
2381 iocb->ki_pos = *ppos + written; 2370 iocb->ki_pos = *ppos + written;
2382 current->backing_dev_info = NULL; 2371 current->backing_dev_info = NULL;
@@ -2441,84 +2430,6 @@ out_sems:
2441 return ret; 2430 return ret;
2442} 2431}
2443 2432
2444static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2445 struct file *out,
2446 struct splice_desc *sd)
2447{
2448 int ret;
2449
2450 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2451 sd->total_len, 0, NULL, NULL);
2452 if (ret < 0) {
2453 mlog_errno(ret);
2454 return ret;
2455 }
2456
2457 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2458}
2459
2460static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2461 struct file *out,
2462 loff_t *ppos,
2463 size_t len,
2464 unsigned int flags)
2465{
2466 int ret;
2467 struct address_space *mapping = out->f_mapping;
2468 struct inode *inode = mapping->host;
2469 struct splice_desc sd = {
2470 .total_len = len,
2471 .flags = flags,
2472 .pos = *ppos,
2473 .u.file = out,
2474 };
2475
2476
2477 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2478 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2479 out->f_path.dentry->d_name.len,
2480 out->f_path.dentry->d_name.name, len);
2481
2482 pipe_lock(pipe);
2483
2484 splice_from_pipe_begin(&sd);
2485 do {
2486 ret = splice_from_pipe_next(pipe, &sd);
2487 if (ret <= 0)
2488 break;
2489
2490 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2491 ret = ocfs2_rw_lock(inode, 1);
2492 if (ret < 0)
2493 mlog_errno(ret);
2494 else {
2495 ret = ocfs2_splice_to_file(pipe, out, &sd);
2496 ocfs2_rw_unlock(inode, 1);
2497 }
2498 mutex_unlock(&inode->i_mutex);
2499 } while (ret > 0);
2500 splice_from_pipe_end(pipe, &sd);
2501
2502 pipe_unlock(pipe);
2503
2504 if (sd.num_spliced)
2505 ret = sd.num_spliced;
2506
2507 if (ret > 0) {
2508 int err;
2509
2510 err = generic_write_sync(out, *ppos, ret);
2511 if (err)
2512 ret = err;
2513 else
2514 *ppos += ret;
2515
2516 balance_dirty_pages_ratelimited(mapping);
2517 }
2518
2519 return ret;
2520}
2521
2522static ssize_t ocfs2_file_splice_read(struct file *in, 2433static ssize_t ocfs2_file_splice_read(struct file *in,
2523 loff_t *ppos, 2434 loff_t *ppos,
2524 struct pipe_inode_info *pipe, 2435 struct pipe_inode_info *pipe,
@@ -2534,7 +2445,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2534 in->f_path.dentry->d_name.name, len); 2445 in->f_path.dentry->d_name.name, len);
2535 2446
2536 /* 2447 /*
2537 * See the comment in ocfs2_file_aio_read() 2448 * See the comment in ocfs2_file_read_iter()
2538 */ 2449 */
2539 ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); 2450 ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
2540 if (ret < 0) { 2451 if (ret < 0) {
@@ -2549,10 +2460,8 @@ bail:
2549 return ret; 2460 return ret;
2550} 2461}
2551 2462
2552static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2463static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2553 const struct iovec *iov, 2464 struct iov_iter *to)
2554 unsigned long nr_segs,
2555 loff_t pos)
2556{ 2465{
2557 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2466 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2558 struct file *filp = iocb->ki_filp; 2467 struct file *filp = iocb->ki_filp;
@@ -2561,7 +2470,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2561 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, 2470 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2562 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2471 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2563 filp->f_path.dentry->d_name.len, 2472 filp->f_path.dentry->d_name.len,
2564 filp->f_path.dentry->d_name.name, nr_segs); 2473 filp->f_path.dentry->d_name.name,
2474 to->nr_segs); /* GRRRRR */
2565 2475
2566 2476
2567 if (!inode) { 2477 if (!inode) {
@@ -2606,13 +2516,13 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2606 } 2516 }
2607 ocfs2_inode_unlock(inode, lock_level); 2517 ocfs2_inode_unlock(inode, lock_level);
2608 2518
2609 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2519 ret = generic_file_read_iter(iocb, to);
2610 trace_generic_file_aio_read_ret(ret); 2520 trace_generic_file_aio_read_ret(ret);
2611 2521
2612 /* buffered aio wouldn't have proper lock coverage today */ 2522 /* buffered aio wouldn't have proper lock coverage today */
2613 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2523 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2614 2524
2615 /* see ocfs2_file_aio_write */ 2525 /* see ocfs2_file_write_iter */
2616 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2526 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2617 rw_level = -1; 2527 rw_level = -1;
2618 have_alloc_sem = 0; 2528 have_alloc_sem = 0;
@@ -2705,14 +2615,14 @@ const struct inode_operations ocfs2_special_file_iops = {
2705 */ 2615 */
2706const struct file_operations ocfs2_fops = { 2616const struct file_operations ocfs2_fops = {
2707 .llseek = ocfs2_file_llseek, 2617 .llseek = ocfs2_file_llseek,
2708 .read = do_sync_read, 2618 .read = new_sync_read,
2709 .write = do_sync_write, 2619 .write = new_sync_write,
2710 .mmap = ocfs2_mmap, 2620 .mmap = ocfs2_mmap,
2711 .fsync = ocfs2_sync_file, 2621 .fsync = ocfs2_sync_file,
2712 .release = ocfs2_file_release, 2622 .release = ocfs2_file_release,
2713 .open = ocfs2_file_open, 2623 .open = ocfs2_file_open,
2714 .aio_read = ocfs2_file_aio_read, 2624 .read_iter = ocfs2_file_read_iter,
2715 .aio_write = ocfs2_file_aio_write, 2625 .write_iter = ocfs2_file_write_iter,
2716 .unlocked_ioctl = ocfs2_ioctl, 2626 .unlocked_ioctl = ocfs2_ioctl,
2717#ifdef CONFIG_COMPAT 2627#ifdef CONFIG_COMPAT
2718 .compat_ioctl = ocfs2_compat_ioctl, 2628 .compat_ioctl = ocfs2_compat_ioctl,
@@ -2720,7 +2630,7 @@ const struct file_operations ocfs2_fops = {
2720 .lock = ocfs2_lock, 2630 .lock = ocfs2_lock,
2721 .flock = ocfs2_flock, 2631 .flock = ocfs2_flock,
2722 .splice_read = ocfs2_file_splice_read, 2632 .splice_read = ocfs2_file_splice_read,
2723 .splice_write = ocfs2_file_splice_write, 2633 .splice_write = iter_file_splice_write,
2724 .fallocate = ocfs2_fallocate, 2634 .fallocate = ocfs2_fallocate,
2725}; 2635};
2726 2636
@@ -2753,21 +2663,21 @@ const struct file_operations ocfs2_dops = {
2753 */ 2663 */
2754const struct file_operations ocfs2_fops_no_plocks = { 2664const struct file_operations ocfs2_fops_no_plocks = {
2755 .llseek = ocfs2_file_llseek, 2665 .llseek = ocfs2_file_llseek,
2756 .read = do_sync_read, 2666 .read = new_sync_read,
2757 .write = do_sync_write, 2667 .write = new_sync_write,
2758 .mmap = ocfs2_mmap, 2668 .mmap = ocfs2_mmap,
2759 .fsync = ocfs2_sync_file, 2669 .fsync = ocfs2_sync_file,
2760 .release = ocfs2_file_release, 2670 .release = ocfs2_file_release,
2761 .open = ocfs2_file_open, 2671 .open = ocfs2_file_open,
2762 .aio_read = ocfs2_file_aio_read, 2672 .read_iter = ocfs2_file_read_iter,
2763 .aio_write = ocfs2_file_aio_write, 2673 .write_iter = ocfs2_file_write_iter,
2764 .unlocked_ioctl = ocfs2_ioctl, 2674 .unlocked_ioctl = ocfs2_ioctl,
2765#ifdef CONFIG_COMPAT 2675#ifdef CONFIG_COMPAT
2766 .compat_ioctl = ocfs2_compat_ioctl, 2676 .compat_ioctl = ocfs2_compat_ioctl,
2767#endif 2677#endif
2768 .flock = ocfs2_flock, 2678 .flock = ocfs2_flock,
2769 .splice_read = ocfs2_file_splice_read, 2679 .splice_read = ocfs2_file_splice_read,
2770 .splice_write = ocfs2_file_splice_write, 2680 .splice_write = iter_file_splice_write,
2771 .fallocate = ocfs2_fallocate, 2681 .fallocate = ocfs2_fallocate,
2772}; 2682};
2773 2683
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2060fc398445..8add6f1030d7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
205 return inode; 205 return inode;
206} 206}
207 207
208static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
209 struct dentry *dentry, struct inode *inode)
210{
211 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
212
213 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
214 ocfs2_lock_res_free(&dl->dl_lockres);
215 BUG_ON(dl->dl_count != 1);
216 spin_lock(&dentry_attach_lock);
217 dentry->d_fsdata = NULL;
218 spin_unlock(&dentry_attach_lock);
219 kfree(dl);
220 iput(inode);
221}
222
208static int ocfs2_mknod(struct inode *dir, 223static int ocfs2_mknod(struct inode *dir,
209 struct dentry *dentry, 224 struct dentry *dentry,
210 umode_t mode, 225 umode_t mode,
@@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir,
231 sigset_t oldset; 246 sigset_t oldset;
232 int did_block_signals = 0; 247 int did_block_signals = 0;
233 struct posix_acl *default_acl = NULL, *acl = NULL; 248 struct posix_acl *default_acl = NULL, *acl = NULL;
249 struct ocfs2_dentry_lock *dl = NULL;
234 250
235 trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, 251 trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
236 (unsigned long long)OCFS2_I(dir)->ip_blkno, 252 (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir,
423 goto leave; 439 goto leave;
424 } 440 }
425 441
442 dl = dentry->d_fsdata;
443
426 status = ocfs2_add_entry(handle, dentry, inode, 444 status = ocfs2_add_entry(handle, dentry, inode,
427 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 445 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
428 &lookup); 446 &lookup);
@@ -469,6 +487,9 @@ leave:
469 * ocfs2_delete_inode will mutex_lock again. 487 * ocfs2_delete_inode will mutex_lock again.
470 */ 488 */
471 if ((status < 0) && inode) { 489 if ((status < 0) && inode) {
490 if (dl)
491 ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
492
472 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; 493 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
473 clear_nlink(inode); 494 clear_nlink(inode);
474 iput(inode); 495 iput(inode);
@@ -991,6 +1012,65 @@ leave:
991 return status; 1012 return status;
992} 1013}
993 1014
1015static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
1016 u64 src_inode_no, u64 dest_inode_no)
1017{
1018 int ret = 0, i = 0;
1019 u64 parent_inode_no = 0;
1020 u64 child_inode_no = src_inode_no;
1021 struct inode *child_inode;
1022
1023#define MAX_LOOKUP_TIMES 32
1024 while (1) {
1025 child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
1026 if (IS_ERR(child_inode)) {
1027 ret = PTR_ERR(child_inode);
1028 break;
1029 }
1030
1031 ret = ocfs2_inode_lock(child_inode, NULL, 0);
1032 if (ret < 0) {
1033 iput(child_inode);
1034 if (ret != -ENOENT)
1035 mlog_errno(ret);
1036 break;
1037 }
1038
1039 ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
1040 &parent_inode_no);
1041 ocfs2_inode_unlock(child_inode, 0);
1042 iput(child_inode);
1043 if (ret < 0) {
1044 ret = -ENOENT;
1045 break;
1046 }
1047
1048 if (parent_inode_no == dest_inode_no) {
1049 ret = 1;
1050 break;
1051 }
1052
1053 if (parent_inode_no == osb->root_inode->i_ino) {
1054 ret = 0;
1055 break;
1056 }
1057
1058 child_inode_no = parent_inode_no;
1059
1060 if (++i >= MAX_LOOKUP_TIMES) {
1061 mlog(ML_NOTICE, "max lookup times reached, filesystem "
1062 "may have nested directories, "
1063 "src inode: %llu, dest inode: %llu.\n",
1064 (unsigned long long)src_inode_no,
1065 (unsigned long long)dest_inode_no);
1066 ret = 0;
1067 break;
1068 }
1069 }
1070
1071 return ret;
1072}
1073
994/* 1074/*
995 * The only place this should be used is rename! 1075 * The only place this should be used is rename!
996 * if they have the same id, then the 1st one is the only one locked. 1076 * if they have the same id, then the 1st one is the only one locked.
@@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1002 struct inode *inode2) 1082 struct inode *inode2)
1003{ 1083{
1004 int status; 1084 int status;
1085 int inode1_is_ancestor, inode2_is_ancestor;
1005 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); 1086 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
1006 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); 1087 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
1007 struct buffer_head **tmpbh; 1088 struct buffer_head **tmpbh;
@@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1015 if (*bh2) 1096 if (*bh2)
1016 *bh2 = NULL; 1097 *bh2 = NULL;
1017 1098
1018 /* we always want to lock the one with the lower lockid first. */ 1099 /* we always want to lock the one with the lower lockid first.
1100 * and if they are nested, we lock ancestor first */
1019 if (oi1->ip_blkno != oi2->ip_blkno) { 1101 if (oi1->ip_blkno != oi2->ip_blkno) {
1020 if (oi1->ip_blkno < oi2->ip_blkno) { 1102 inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
1103 oi1->ip_blkno);
1104 if (inode1_is_ancestor < 0) {
1105 status = inode1_is_ancestor;
1106 goto bail;
1107 }
1108
1109 inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
1110 oi2->ip_blkno);
1111 if (inode2_is_ancestor < 0) {
1112 status = inode2_is_ancestor;
1113 goto bail;
1114 }
1115
1116 if ((inode1_is_ancestor == 1) ||
1117 (oi1->ip_blkno < oi2->ip_blkno &&
1118 inode2_is_ancestor == 0)) {
1021 /* switch id1 and id2 around */ 1119 /* switch id1 and id2 around */
1022 tmpbh = bh2; 1120 tmpbh = bh2;
1023 bh2 = bh1; 1121 bh2 = bh1;
@@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir,
1098 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; 1196 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
1099 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 1197 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
1100 struct ocfs2_dir_lookup_result target_insert = { NULL, }; 1198 struct ocfs2_dir_lookup_result target_insert = { NULL, };
1199 bool should_add_orphan = false;
1101 1200
1102 /* At some point it might be nice to break this function up a 1201 /* At some point it might be nice to break this function up a
1103 * bit. */ 1202 * bit. */
@@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
1134 goto bail; 1233 goto bail;
1135 } 1234 }
1136 rename_lock = 1; 1235 rename_lock = 1;
1236
1237 /* here we cannot guarantee the inodes haven't just been
1238 * changed, so check if they are nested again */
1239 status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
1240 old_inode->i_ino);
1241 if (status < 0) {
1242 mlog_errno(status);
1243 goto bail;
1244 } else if (status == 1) {
1245 status = -EPERM;
1246 trace_ocfs2_rename_not_permitted(
1247 (unsigned long long)old_inode->i_ino,
1248 (unsigned long long)new_dir->i_ino);
1249 goto bail;
1250 }
1137 } 1251 }
1138 1252
1139 /* if old and new are the same, this'll just do one lock. */ 1253 /* if old and new are the same, this'll just do one lock. */
@@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
1304 mlog_errno(status); 1418 mlog_errno(status);
1305 goto bail; 1419 goto bail;
1306 } 1420 }
1421 should_add_orphan = true;
1307 } 1422 }
1308 } else { 1423 } else {
1309 BUG_ON(new_dentry->d_parent->d_inode != new_dir); 1424 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,
1348 goto bail; 1463 goto bail;
1349 } 1464 }
1350 1465
1351 if (S_ISDIR(new_inode->i_mode) ||
1352 (ocfs2_read_links_count(newfe) == 1)) {
1353 status = ocfs2_orphan_add(osb, handle, new_inode,
1354 newfe_bh, orphan_name,
1355 &orphan_insert, orphan_dir);
1356 if (status < 0) {
1357 mlog_errno(status);
1358 goto bail;
1359 }
1360 }
1361
1362 /* change the dirent to point to the correct inode */ 1466 /* change the dirent to point to the correct inode */
1363 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, 1467 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
1364 old_inode); 1468 old_inode);
@@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir,
1373 else 1477 else
1374 ocfs2_add_links_count(newfe, -1); 1478 ocfs2_add_links_count(newfe, -1);
1375 ocfs2_journal_dirty(handle, newfe_bh); 1479 ocfs2_journal_dirty(handle, newfe_bh);
1480 if (should_add_orphan) {
1481 status = ocfs2_orphan_add(osb, handle, new_inode,
1482 newfe_bh, orphan_name,
1483 &orphan_insert, orphan_dir);
1484 if (status < 0) {
1485 mlog_errno(status);
1486 goto bail;
1487 }
1488 }
1376 } else { 1489 } else {
1377 /* if the name was not found in new_dir, add it now */ 1490 /* if the name was not found in new_dir, add it now */
1378 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1491 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir,
1642 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1755 struct ocfs2_dir_lookup_result lookup = { NULL, };
1643 sigset_t oldset; 1756 sigset_t oldset;
1644 int did_block_signals = 0; 1757 int did_block_signals = 0;
1758 struct ocfs2_dentry_lock *dl = NULL;
1645 1759
1646 trace_ocfs2_symlink_begin(dir, dentry, symname, 1760 trace_ocfs2_symlink_begin(dir, dentry, symname,
1647 dentry->d_name.len, dentry->d_name.name); 1761 dentry->d_name.len, dentry->d_name.name);
@@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir,
1830 goto bail; 1944 goto bail;
1831 } 1945 }
1832 1946
1947 dl = dentry->d_fsdata;
1948
1833 status = ocfs2_add_entry(handle, dentry, inode, 1949 status = ocfs2_add_entry(handle, dentry, inode,
1834 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1950 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1835 &lookup); 1951 &lookup);
@@ -1864,6 +1980,9 @@ bail:
1864 if (xattr_ac) 1980 if (xattr_ac)
1865 ocfs2_free_alloc_context(xattr_ac); 1981 ocfs2_free_alloc_context(xattr_ac);
1866 if ((status < 0) && inode) { 1982 if ((status < 0) && inode) {
1983 if (dl)
1984 ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
1985
1867 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; 1986 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
1868 clear_nlink(inode); 1987 clear_nlink(inode);
1869 iput(inode); 1988 iput(inode);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1b60c62aa9d6..6cb019b7c6a8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
2292 __entry->new_len, __get_str(new_name)) 2292 __entry->new_len, __get_str(new_name))
2293); 2293);
2294 2294
2295DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
2296
2295TRACE_EVENT(ocfs2_rename_target_exists, 2297TRACE_EVENT(ocfs2_rename_target_exists,
2296 TP_PROTO(int new_len, const char *new_name), 2298 TP_PROTO(int new_len, const char *new_name),
2297 TP_ARGS(new_len, new_name), 2299 TP_ARGS(new_len, new_name),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 714e53b9cc66..636aab69ead5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4288 goto out; 4288 goto out;
4289 } 4289 }
4290 4290
4291 error = ocfs2_rw_lock(inode, 1);
4292 if (error) {
4293 mlog_errno(error);
4294 goto out;
4295 }
4296
4291 error = ocfs2_inode_lock(inode, &old_bh, 1); 4297 error = ocfs2_inode_lock(inode, &old_bh, 1);
4292 if (error) { 4298 if (error) {
4293 mlog_errno(error); 4299 mlog_errno(error);
4300 ocfs2_rw_unlock(inode, 1);
4294 goto out; 4301 goto out;
4295 } 4302 }
4296 4303
@@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4302 up_write(&OCFS2_I(inode)->ip_xattr_sem); 4309 up_write(&OCFS2_I(inode)->ip_xattr_sem);
4303 4310
4304 ocfs2_inode_unlock(inode, 1); 4311 ocfs2_inode_unlock(inode, 1);
4312 ocfs2_rw_unlock(inode, 1);
4305 brelse(old_bh); 4313 brelse(old_bh);
4306 4314
4307 if (error) { 4315 if (error) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c7a89cea5c5d..ddb662b32447 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1925 1925
1926 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1927 1927
1928 ocfs2_truncate_log_shutdown(osb);
1929
1928 /* This will disable recovery and flush any recovery work. */ 1930 /* This will disable recovery and flush any recovery work. */
1929 ocfs2_recovery_exit(osb); 1931 ocfs2_recovery_exit(osb);
1930 1932
1931 /*
1932 * During dismount, when it recovers another node it will call
1933 * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
1934 */
1935 ocfs2_truncate_log_shutdown(osb);
1936
1937 ocfs2_journal_shutdown(osb); 1933 ocfs2_journal_shutdown(osb);
1938 1934
1939 ocfs2_sync_blockdev(sb); 1935 ocfs2_sync_blockdev(sb);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 54d57d6ba68d..902e88527fce 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -337,10 +337,10 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
337 337
338const struct file_operations omfs_file_operations = { 338const struct file_operations omfs_file_operations = {
339 .llseek = generic_file_llseek, 339 .llseek = generic_file_llseek,
340 .read = do_sync_read, 340 .read = new_sync_read,
341 .write = do_sync_write, 341 .write = new_sync_write,
342 .aio_read = generic_file_aio_read, 342 .read_iter = generic_file_read_iter,
343 .aio_write = generic_file_aio_write, 343 .write_iter = generic_file_write_iter,
344 .mmap = generic_file_mmap, 344 .mmap = generic_file_mmap,
345 .fsync = generic_file_fsync, 345 .fsync = generic_file_fsync,
346 .splice_read = generic_file_splice_read, 346 .splice_read = generic_file_splice_read,
diff --git a/fs/open.c b/fs/open.c
index 9d64679cec73..36662d036237 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -725,6 +725,12 @@ static int do_dentry_open(struct file *f,
725 } 725 }
726 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) 726 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
727 i_readcount_inc(inode); 727 i_readcount_inc(inode);
728 if ((f->f_mode & FMODE_READ) &&
729 likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter))
730 f->f_mode |= FMODE_CAN_READ;
731 if ((f->f_mode & FMODE_WRITE) &&
732 likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter))
733 f->f_mode |= FMODE_CAN_WRITE;
728 734
729 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 735 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
730 736
diff --git a/fs/pipe.c b/fs/pipe.c
index 034bffac3f97..21981e58e2a6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -116,50 +116,6 @@ void pipe_wait(struct pipe_inode_info *pipe)
116 pipe_lock(pipe); 116 pipe_lock(pipe);
117} 117}
118 118
119static int
120pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
121 int atomic)
122{
123 unsigned long copy;
124
125 while (len > 0) {
126 while (!iov->iov_len)
127 iov++;
128 copy = min_t(unsigned long, len, iov->iov_len);
129
130 if (atomic) {
131 if (__copy_from_user_inatomic(to, iov->iov_base, copy))
132 return -EFAULT;
133 } else {
134 if (copy_from_user(to, iov->iov_base, copy))
135 return -EFAULT;
136 }
137 to += copy;
138 len -= copy;
139 iov->iov_base += copy;
140 iov->iov_len -= copy;
141 }
142 return 0;
143}
144
145/*
146 * Pre-fault in the user memory, so we can use atomic copies.
147 */
148static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
149{
150 while (!iov->iov_len)
151 iov++;
152
153 while (len > 0) {
154 unsigned long this_len;
155
156 this_len = min_t(unsigned long, len, iov->iov_len);
157 fault_in_pages_readable(iov->iov_base, this_len);
158 len -= this_len;
159 iov++;
160 }
161}
162
163static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 119static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
164 struct pipe_buffer *buf) 120 struct pipe_buffer *buf)
165{ 121{
@@ -271,24 +227,18 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
271}; 227};
272 228
273static ssize_t 229static ssize_t
274pipe_read(struct kiocb *iocb, const struct iovec *_iov, 230pipe_read(struct kiocb *iocb, struct iov_iter *to)
275 unsigned long nr_segs, loff_t pos)
276{ 231{
232 size_t total_len = iov_iter_count(to);
277 struct file *filp = iocb->ki_filp; 233 struct file *filp = iocb->ki_filp;
278 struct pipe_inode_info *pipe = filp->private_data; 234 struct pipe_inode_info *pipe = filp->private_data;
279 int do_wakeup; 235 int do_wakeup;
280 ssize_t ret; 236 ssize_t ret;
281 struct iovec *iov = (struct iovec *)_iov;
282 size_t total_len;
283 struct iov_iter iter;
284 237
285 total_len = iov_length(iov, nr_segs);
286 /* Null read succeeds. */ 238 /* Null read succeeds. */
287 if (unlikely(total_len == 0)) 239 if (unlikely(total_len == 0))
288 return 0; 240 return 0;
289 241
290 iov_iter_init(&iter, iov, nr_segs, total_len, 0);
291
292 do_wakeup = 0; 242 do_wakeup = 0;
293 ret = 0; 243 ret = 0;
294 __pipe_lock(pipe); 244 __pipe_lock(pipe);
@@ -312,7 +262,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
312 break; 262 break;
313 } 263 }
314 264
315 written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); 265 written = copy_page_to_iter(buf->page, buf->offset, chars, to);
316 if (unlikely(written < chars)) { 266 if (unlikely(written < chars)) {
317 if (!ret) 267 if (!ret)
318 ret = -EFAULT; 268 ret = -EFAULT;
@@ -386,24 +336,19 @@ static inline int is_packetized(struct file *file)
386} 336}
387 337
388static ssize_t 338static ssize_t
389pipe_write(struct kiocb *iocb, const struct iovec *_iov, 339pipe_write(struct kiocb *iocb, struct iov_iter *from)
390 unsigned long nr_segs, loff_t ppos)
391{ 340{
392 struct file *filp = iocb->ki_filp; 341 struct file *filp = iocb->ki_filp;
393 struct pipe_inode_info *pipe = filp->private_data; 342 struct pipe_inode_info *pipe = filp->private_data;
394 ssize_t ret; 343 ssize_t ret = 0;
395 int do_wakeup; 344 int do_wakeup = 0;
396 struct iovec *iov = (struct iovec *)_iov; 345 size_t total_len = iov_iter_count(from);
397 size_t total_len;
398 ssize_t chars; 346 ssize_t chars;
399 347
400 total_len = iov_length(iov, nr_segs);
401 /* Null write succeeds. */ 348 /* Null write succeeds. */
402 if (unlikely(total_len == 0)) 349 if (unlikely(total_len == 0))
403 return 0; 350 return 0;
404 351
405 do_wakeup = 0;
406 ret = 0;
407 __pipe_lock(pipe); 352 __pipe_lock(pipe);
408 353
409 if (!pipe->readers) { 354 if (!pipe->readers) {
@@ -422,38 +367,19 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
422 int offset = buf->offset + buf->len; 367 int offset = buf->offset + buf->len;
423 368
424 if (ops->can_merge && offset + chars <= PAGE_SIZE) { 369 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
425 int error, atomic = 1; 370 int error = ops->confirm(pipe, buf);
426 void *addr;
427
428 error = ops->confirm(pipe, buf);
429 if (error) 371 if (error)
430 goto out; 372 goto out;
431 373
432 iov_fault_in_pages_read(iov, chars); 374 ret = copy_page_from_iter(buf->page, offset, chars, from);
433redo1: 375 if (unlikely(ret < chars)) {
434 if (atomic) 376 error = -EFAULT;
435 addr = kmap_atomic(buf->page);
436 else
437 addr = kmap(buf->page);
438 error = pipe_iov_copy_from_user(offset + addr, iov,
439 chars, atomic);
440 if (atomic)
441 kunmap_atomic(addr);
442 else
443 kunmap(buf->page);
444 ret = error;
445 do_wakeup = 1;
446 if (error) {
447 if (atomic) {
448 atomic = 0;
449 goto redo1;
450 }
451 goto out; 377 goto out;
452 } 378 }
379 do_wakeup = 1;
453 buf->len += chars; 380 buf->len += chars;
454 total_len -= chars;
455 ret = chars; 381 ret = chars;
456 if (!total_len) 382 if (!iov_iter_count(from))
457 goto out; 383 goto out;
458 } 384 }
459 } 385 }
@@ -472,8 +398,7 @@ redo1:
472 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); 398 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
473 struct pipe_buffer *buf = pipe->bufs + newbuf; 399 struct pipe_buffer *buf = pipe->bufs + newbuf;
474 struct page *page = pipe->tmp_page; 400 struct page *page = pipe->tmp_page;
475 char *src; 401 int copied;
476 int error, atomic = 1;
477 402
478 if (!page) { 403 if (!page) {
479 page = alloc_page(GFP_HIGHUSER); 404 page = alloc_page(GFP_HIGHUSER);
@@ -489,40 +414,19 @@ redo1:
489 * FIXME! Is this really true? 414 * FIXME! Is this really true?
490 */ 415 */
491 do_wakeup = 1; 416 do_wakeup = 1;
492 chars = PAGE_SIZE; 417 copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
493 if (chars > total_len) 418 if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
494 chars = total_len;
495
496 iov_fault_in_pages_read(iov, chars);
497redo2:
498 if (atomic)
499 src = kmap_atomic(page);
500 else
501 src = kmap(page);
502
503 error = pipe_iov_copy_from_user(src, iov, chars,
504 atomic);
505 if (atomic)
506 kunmap_atomic(src);
507 else
508 kunmap(page);
509
510 if (unlikely(error)) {
511 if (atomic) {
512 atomic = 0;
513 goto redo2;
514 }
515 if (!ret) 419 if (!ret)
516 ret = error; 420 ret = -EFAULT;
517 break; 421 break;
518 } 422 }
519 ret += chars; 423 ret += copied;
520 424
521 /* Insert it into the buffer array */ 425 /* Insert it into the buffer array */
522 buf->page = page; 426 buf->page = page;
523 buf->ops = &anon_pipe_buf_ops; 427 buf->ops = &anon_pipe_buf_ops;
524 buf->offset = 0; 428 buf->offset = 0;
525 buf->len = chars; 429 buf->len = copied;
526 buf->flags = 0; 430 buf->flags = 0;
527 if (is_packetized(filp)) { 431 if (is_packetized(filp)) {
528 buf->ops = &packet_pipe_buf_ops; 432 buf->ops = &packet_pipe_buf_ops;
@@ -531,8 +435,7 @@ redo2:
531 pipe->nrbufs = ++bufs; 435 pipe->nrbufs = ++bufs;
532 pipe->tmp_page = NULL; 436 pipe->tmp_page = NULL;
533 437
534 total_len -= chars; 438 if (!iov_iter_count(from))
535 if (!total_len)
536 break; 439 break;
537 } 440 }
538 if (bufs < pipe->buffers) 441 if (bufs < pipe->buffers)
@@ -1044,10 +947,10 @@ err:
1044const struct file_operations pipefifo_fops = { 947const struct file_operations pipefifo_fops = {
1045 .open = fifo_open, 948 .open = fifo_open,
1046 .llseek = no_llseek, 949 .llseek = no_llseek,
1047 .read = do_sync_read, 950 .read = new_sync_read,
1048 .aio_read = pipe_read, 951 .read_iter = pipe_read,
1049 .write = do_sync_write, 952 .write = new_sync_write,
1050 .aio_write = pipe_write, 953 .write_iter = pipe_write,
1051 .poll = pipe_poll, 954 .poll = pipe_poll,
1052 .unlocked_ioctl = pipe_ioctl, 955 .unlocked_ioctl = pipe_ioctl,
1053 .release = pipe_release, 956 .release = pipe_release,
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9d231e9e5f0e..bf2d03f8fd3e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v)
184 184
185static int stat_open(struct inode *inode, struct file *file) 185static int stat_open(struct inode *inode, struct file *file)
186{ 186{
187 size_t size = 1024 + 128 * num_possible_cpus(); 187 size_t size = 1024 + 128 * num_online_cpus();
188 char *buf;
189 struct seq_file *m;
190 int res;
191 188
192 /* minimum size to display an interrupt count : 2 bytes */ 189 /* minimum size to display an interrupt count : 2 bytes */
193 size += 2 * nr_irqs; 190 size += 2 * nr_irqs;
194 191 return single_open_size(file, show_stat, NULL, size);
195 /* don't ask for more than the kmalloc() max size */
196 if (size > KMALLOC_MAX_SIZE)
197 size = KMALLOC_MAX_SIZE;
198 buf = kmalloc(size, GFP_KERNEL);
199 if (!buf)
200 return -ENOMEM;
201
202 res = single_open(file, show_stat, NULL);
203 if (!res) {
204 m = file->private_data;
205 m->buf = buf;
206 m->size = ksize(buf);
207 } else
208 kfree(buf);
209 return res;
210} 192}
211 193
212static const struct file_operations proc_stat_operations = { 194static const struct file_operations proc_stat_operations = {
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 1e56a4e8cf7c..4f56de822d2f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -31,14 +31,14 @@
31#include "internal.h" 31#include "internal.h"
32 32
33const struct file_operations ramfs_file_operations = { 33const struct file_operations ramfs_file_operations = {
34 .read = do_sync_read, 34 .read = new_sync_read,
35 .aio_read = generic_file_aio_read, 35 .read_iter = generic_file_read_iter,
36 .write = do_sync_write, 36 .write = new_sync_write,
37 .aio_write = generic_file_aio_write, 37 .write_iter = generic_file_write_iter,
38 .mmap = generic_file_mmap, 38 .mmap = generic_file_mmap,
39 .fsync = noop_fsync, 39 .fsync = noop_fsync,
40 .splice_read = generic_file_splice_read, 40 .splice_read = generic_file_splice_read,
41 .splice_write = generic_file_splice_write, 41 .splice_write = iter_file_splice_write,
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
43}; 43};
44 44
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0b3d8e4cb2fa..dda012ad4208 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -37,13 +37,13 @@ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
37const struct file_operations ramfs_file_operations = { 37const struct file_operations ramfs_file_operations = {
38 .mmap = ramfs_nommu_mmap, 38 .mmap = ramfs_nommu_mmap,
39 .get_unmapped_area = ramfs_nommu_get_unmapped_area, 39 .get_unmapped_area = ramfs_nommu_get_unmapped_area,
40 .read = do_sync_read, 40 .read = new_sync_read,
41 .aio_read = generic_file_aio_read, 41 .read_iter = generic_file_read_iter,
42 .write = do_sync_write, 42 .write = new_sync_write,
43 .aio_write = generic_file_aio_write, 43 .write_iter = generic_file_write_iter,
44 .fsync = noop_fsync, 44 .fsync = noop_fsync,
45 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
46 .splice_write = generic_file_splice_write, 46 .splice_write = iter_file_splice_write,
47 .llseek = generic_file_llseek, 47 .llseek = generic_file_llseek,
48}; 48};
49 49
diff --git a/fs/read_write.c b/fs/read_write.c
index 31c6efa43183..009d8542a889 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -25,11 +25,12 @@
25typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); 25typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
26typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, 26typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
27 unsigned long, loff_t); 27 unsigned long, loff_t);
28typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
28 29
29const struct file_operations generic_ro_fops = { 30const struct file_operations generic_ro_fops = {
30 .llseek = generic_file_llseek, 31 .llseek = generic_file_llseek,
31 .read = do_sync_read, 32 .read = new_sync_read,
32 .aio_read = generic_file_aio_read, 33 .read_iter = generic_file_read_iter,
33 .mmap = generic_file_readonly_mmap, 34 .mmap = generic_file_readonly_mmap,
34 .splice_read = generic_file_splice_read, 35 .splice_read = generic_file_splice_read,
35}; 36};
@@ -390,13 +391,34 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
390 391
391EXPORT_SYMBOL(do_sync_read); 392EXPORT_SYMBOL(do_sync_read);
392 393
394ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
395{
396 struct iovec iov = { .iov_base = buf, .iov_len = len };
397 struct kiocb kiocb;
398 struct iov_iter iter;
399 ssize_t ret;
400
401 init_sync_kiocb(&kiocb, filp);
402 kiocb.ki_pos = *ppos;
403 kiocb.ki_nbytes = len;
404 iov_iter_init(&iter, READ, &iov, 1, len);
405
406 ret = filp->f_op->read_iter(&kiocb, &iter);
407 if (-EIOCBQUEUED == ret)
408 ret = wait_on_sync_kiocb(&kiocb);
409 *ppos = kiocb.ki_pos;
410 return ret;
411}
412
413EXPORT_SYMBOL(new_sync_read);
414
393ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 415ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
394{ 416{
395 ssize_t ret; 417 ssize_t ret;
396 418
397 if (!(file->f_mode & FMODE_READ)) 419 if (!(file->f_mode & FMODE_READ))
398 return -EBADF; 420 return -EBADF;
399 if (!file->f_op->read && !file->f_op->aio_read) 421 if (!(file->f_mode & FMODE_CAN_READ))
400 return -EINVAL; 422 return -EINVAL;
401 if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) 423 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
402 return -EFAULT; 424 return -EFAULT;
@@ -406,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
406 count = ret; 428 count = ret;
407 if (file->f_op->read) 429 if (file->f_op->read)
408 ret = file->f_op->read(file, buf, count, pos); 430 ret = file->f_op->read(file, buf, count, pos);
409 else 431 else if (file->f_op->aio_read)
410 ret = do_sync_read(file, buf, count, pos); 432 ret = do_sync_read(file, buf, count, pos);
433 else
434 ret = new_sync_read(file, buf, count, pos);
411 if (ret > 0) { 435 if (ret > 0) {
412 fsnotify_access(file); 436 fsnotify_access(file);
413 add_rchar(current, ret); 437 add_rchar(current, ret);
@@ -439,13 +463,34 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
439 463
440EXPORT_SYMBOL(do_sync_write); 464EXPORT_SYMBOL(do_sync_write);
441 465
466ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
467{
468 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
469 struct kiocb kiocb;
470 struct iov_iter iter;
471 ssize_t ret;
472
473 init_sync_kiocb(&kiocb, filp);
474 kiocb.ki_pos = *ppos;
475 kiocb.ki_nbytes = len;
476 iov_iter_init(&iter, WRITE, &iov, 1, len);
477
478 ret = filp->f_op->write_iter(&kiocb, &iter);
479 if (-EIOCBQUEUED == ret)
480 ret = wait_on_sync_kiocb(&kiocb);
481 *ppos = kiocb.ki_pos;
482 return ret;
483}
484
485EXPORT_SYMBOL(new_sync_write);
486
442ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) 487ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
443{ 488{
444 mm_segment_t old_fs; 489 mm_segment_t old_fs;
445 const char __user *p; 490 const char __user *p;
446 ssize_t ret; 491 ssize_t ret;
447 492
448 if (!file->f_op->write && !file->f_op->aio_write) 493 if (!(file->f_mode & FMODE_CAN_WRITE))
449 return -EINVAL; 494 return -EINVAL;
450 495
451 old_fs = get_fs(); 496 old_fs = get_fs();
@@ -455,8 +500,10 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
455 count = MAX_RW_COUNT; 500 count = MAX_RW_COUNT;
456 if (file->f_op->write) 501 if (file->f_op->write)
457 ret = file->f_op->write(file, p, count, pos); 502 ret = file->f_op->write(file, p, count, pos);
458 else 503 else if (file->f_op->aio_write)
459 ret = do_sync_write(file, p, count, pos); 504 ret = do_sync_write(file, p, count, pos);
505 else
506 ret = new_sync_write(file, p, count, pos);
460 set_fs(old_fs); 507 set_fs(old_fs);
461 if (ret > 0) { 508 if (ret > 0) {
462 fsnotify_modify(file); 509 fsnotify_modify(file);
@@ -472,7 +519,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
472 519
473 if (!(file->f_mode & FMODE_WRITE)) 520 if (!(file->f_mode & FMODE_WRITE))
474 return -EBADF; 521 return -EBADF;
475 if (!file->f_op->write && !file->f_op->aio_write) 522 if (!(file->f_mode & FMODE_CAN_WRITE))
476 return -EINVAL; 523 return -EINVAL;
477 if (unlikely(!access_ok(VERIFY_READ, buf, count))) 524 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
478 return -EFAULT; 525 return -EFAULT;
@@ -483,8 +530,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
483 file_start_write(file); 530 file_start_write(file);
484 if (file->f_op->write) 531 if (file->f_op->write)
485 ret = file->f_op->write(file, buf, count, pos); 532 ret = file->f_op->write(file, buf, count, pos);
486 else 533 else if (file->f_op->aio_write)
487 ret = do_sync_write(file, buf, count, pos); 534 ret = do_sync_write(file, buf, count, pos);
535 else
536 ret = new_sync_write(file, buf, count, pos);
488 if (ret > 0) { 537 if (ret > 0) {
489 fsnotify_modify(file); 538 fsnotify_modify(file);
490 add_wchar(current, ret); 539 add_wchar(current, ret);
@@ -601,6 +650,25 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
601} 650}
602EXPORT_SYMBOL(iov_shorten); 651EXPORT_SYMBOL(iov_shorten);
603 652
653static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
654 unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
655{
656 struct kiocb kiocb;
657 struct iov_iter iter;
658 ssize_t ret;
659
660 init_sync_kiocb(&kiocb, filp);
661 kiocb.ki_pos = *ppos;
662 kiocb.ki_nbytes = len;
663
664 iov_iter_init(&iter, rw, iov, nr_segs, len);
665 ret = fn(&kiocb, &iter);
666 if (ret == -EIOCBQUEUED)
667 ret = wait_on_sync_kiocb(&kiocb);
668 *ppos = kiocb.ki_pos;
669 return ret;
670}
671
604static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 672static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
605 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 673 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
606{ 674{
@@ -738,6 +806,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
738 ssize_t ret; 806 ssize_t ret;
739 io_fn_t fn; 807 io_fn_t fn;
740 iov_fn_t fnv; 808 iov_fn_t fnv;
809 iter_fn_t iter_fn;
741 810
742 ret = rw_copy_check_uvector(type, uvector, nr_segs, 811 ret = rw_copy_check_uvector(type, uvector, nr_segs,
743 ARRAY_SIZE(iovstack), iovstack, &iov); 812 ARRAY_SIZE(iovstack), iovstack, &iov);
@@ -753,13 +822,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
753 if (type == READ) { 822 if (type == READ) {
754 fn = file->f_op->read; 823 fn = file->f_op->read;
755 fnv = file->f_op->aio_read; 824 fnv = file->f_op->aio_read;
825 iter_fn = file->f_op->read_iter;
756 } else { 826 } else {
757 fn = (io_fn_t)file->f_op->write; 827 fn = (io_fn_t)file->f_op->write;
758 fnv = file->f_op->aio_write; 828 fnv = file->f_op->aio_write;
829 iter_fn = file->f_op->write_iter;
759 file_start_write(file); 830 file_start_write(file);
760 } 831 }
761 832
762 if (fnv) 833 if (iter_fn)
834 ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
835 pos, iter_fn);
836 else if (fnv)
763 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 837 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
764 pos, fnv); 838 pos, fnv);
765 else 839 else
@@ -785,7 +859,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
785{ 859{
786 if (!(file->f_mode & FMODE_READ)) 860 if (!(file->f_mode & FMODE_READ))
787 return -EBADF; 861 return -EBADF;
788 if (!file->f_op->aio_read && !file->f_op->read) 862 if (!(file->f_mode & FMODE_CAN_READ))
789 return -EINVAL; 863 return -EINVAL;
790 864
791 return do_readv_writev(READ, file, vec, vlen, pos); 865 return do_readv_writev(READ, file, vec, vlen, pos);
@@ -798,7 +872,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
798{ 872{
799 if (!(file->f_mode & FMODE_WRITE)) 873 if (!(file->f_mode & FMODE_WRITE))
800 return -EBADF; 874 return -EBADF;
801 if (!file->f_op->aio_write && !file->f_op->write) 875 if (!(file->f_mode & FMODE_CAN_WRITE))
802 return -EINVAL; 876 return -EINVAL;
803 877
804 return do_readv_writev(WRITE, file, vec, vlen, pos); 878 return do_readv_writev(WRITE, file, vec, vlen, pos);
@@ -912,6 +986,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
912 ssize_t ret; 986 ssize_t ret;
913 io_fn_t fn; 987 io_fn_t fn;
914 iov_fn_t fnv; 988 iov_fn_t fnv;
989 iter_fn_t iter_fn;
915 990
916 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 991 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
917 UIO_FASTIOV, iovstack, &iov); 992 UIO_FASTIOV, iovstack, &iov);
@@ -927,13 +1002,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
927 if (type == READ) { 1002 if (type == READ) {
928 fn = file->f_op->read; 1003 fn = file->f_op->read;
929 fnv = file->f_op->aio_read; 1004 fnv = file->f_op->aio_read;
1005 iter_fn = file->f_op->read_iter;
930 } else { 1006 } else {
931 fn = (io_fn_t)file->f_op->write; 1007 fn = (io_fn_t)file->f_op->write;
932 fnv = file->f_op->aio_write; 1008 fnv = file->f_op->aio_write;
1009 iter_fn = file->f_op->write_iter;
933 file_start_write(file); 1010 file_start_write(file);
934 } 1011 }
935 1012
936 if (fnv) 1013 if (iter_fn)
1014 ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
1015 pos, iter_fn);
1016 else if (fnv)
937 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 1017 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
938 pos, fnv); 1018 pos, fnv);
939 else 1019 else
@@ -964,7 +1044,7 @@ static size_t compat_readv(struct file *file,
964 goto out; 1044 goto out;
965 1045
966 ret = -EINVAL; 1046 ret = -EINVAL;
967 if (!file->f_op->aio_read && !file->f_op->read) 1047 if (!(file->f_mode & FMODE_CAN_READ))
968 goto out; 1048 goto out;
969 1049
970 ret = compat_do_readv_writev(READ, file, vec, vlen, pos); 1050 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
@@ -1041,7 +1121,7 @@ static size_t compat_writev(struct file *file,
1041 goto out; 1121 goto out;
1042 1122
1043 ret = -EINVAL; 1123 ret = -EINVAL;
1044 if (!file->f_op->aio_write && !file->f_op->write) 1124 if (!(file->f_mode & FMODE_CAN_WRITE))
1045 goto out; 1125 goto out;
1046 1126
1047 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); 1127 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 5f6c32c668b6..db9e80ba53a0 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -243,8 +243,8 @@ drop_write_lock:
243} 243}
244 244
245const struct file_operations reiserfs_file_operations = { 245const struct file_operations reiserfs_file_operations = {
246 .read = do_sync_read, 246 .read = new_sync_read,
247 .write = do_sync_write, 247 .write = new_sync_write,
248 .unlocked_ioctl = reiserfs_ioctl, 248 .unlocked_ioctl = reiserfs_ioctl,
249#ifdef CONFIG_COMPAT 249#ifdef CONFIG_COMPAT
250 .compat_ioctl = reiserfs_compat_ioctl, 250 .compat_ioctl = reiserfs_compat_ioctl,
@@ -253,10 +253,10 @@ const struct file_operations reiserfs_file_operations = {
253 .open = reiserfs_file_open, 253 .open = reiserfs_file_open,
254 .release = reiserfs_file_release, 254 .release = reiserfs_file_release,
255 .fsync = reiserfs_sync_file, 255 .fsync = reiserfs_sync_file,
256 .aio_read = generic_file_aio_read, 256 .read_iter = generic_file_read_iter,
257 .aio_write = generic_file_aio_write, 257 .write_iter = generic_file_write_iter,
258 .splice_read = generic_file_splice_read, 258 .splice_read = generic_file_splice_read,
259 .splice_write = generic_file_splice_write, 259 .splice_write = iter_file_splice_write,
260 .llseek = generic_file_llseek, 260 .llseek = generic_file_llseek,
261}; 261};
262 262
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e3ca04894919..63b2b0ec49e6 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3279,15 +3279,15 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
3279 * to do in this section of the code. 3279 * to do in this section of the code.
3280 */ 3280 */
3281static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, 3281static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3282 const struct iovec *iov, loff_t offset, 3282 struct iov_iter *iter, loff_t offset)
3283 unsigned long nr_segs)
3284{ 3283{
3285 struct file *file = iocb->ki_filp; 3284 struct file *file = iocb->ki_filp;
3286 struct inode *inode = file->f_mapping->host; 3285 struct inode *inode = file->f_mapping->host;
3286 size_t count = iov_iter_count(iter);
3287 ssize_t ret; 3287 ssize_t ret;
3288 3288
3289 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 3289 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
3290 reiserfs_get_blocks_direct_io); 3290 reiserfs_get_blocks_direct_io);
3291 3291
3292 /* 3292 /*
3293 * In case of error extending write may have instantiated a few 3293 * In case of error extending write may have instantiated a few
@@ -3295,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3295 */ 3295 */
3296 if (unlikely((rw & WRITE) && ret < 0)) { 3296 if (unlikely((rw & WRITE) && ret < 0)) {
3297 loff_t isize = i_size_read(inode); 3297 loff_t isize = i_size_read(inode);
3298 loff_t end = offset + iov_length(iov, nr_segs); 3298 loff_t end = offset + count;
3299 3299
3300 if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { 3300 if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
3301 truncate_setsize(inode, isize); 3301 truncate_setsize(inode, isize);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index f373bde8f545..ea06c7554860 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -72,8 +72,8 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
72 72
73const struct file_operations romfs_ro_fops = { 73const struct file_operations romfs_ro_fops = {
74 .llseek = generic_file_llseek, 74 .llseek = generic_file_llseek,
75 .read = do_sync_read, 75 .read = new_sync_read,
76 .aio_read = generic_file_aio_read, 76 .read_iter = generic_file_read_iter,
77 .splice_read = generic_file_splice_read, 77 .splice_read = generic_file_splice_read,
78 .mmap = romfs_mmap, 78 .mmap = romfs_mmap,
79 .get_unmapped_area = romfs_get_unmapped_area, 79 .get_unmapped_area = romfs_get_unmapped_area,
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1d641bb108d2..3857b720cb1b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -8,8 +8,10 @@
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/vmalloc.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
14#include <linux/mm.h>
13 15
14#include <asm/uaccess.h> 16#include <asm/uaccess.h>
15#include <asm/page.h> 17#include <asm/page.h>
@@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m)
30 m->count = m->size; 32 m->count = m->size;
31} 33}
32 34
35static void *seq_buf_alloc(unsigned long size)
36{
37 void *buf;
38
39 buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
40 if (!buf && size > PAGE_SIZE)
41 buf = vmalloc(size);
42 return buf;
43}
44
33/** 45/**
34 * seq_open - initialize sequential file 46 * seq_open - initialize sequential file
35 * @file: file we initialize 47 * @file: file we initialize
@@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset)
96 return 0; 108 return 0;
97 } 109 }
98 if (!m->buf) { 110 if (!m->buf) {
99 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 111 m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
100 if (!m->buf) 112 if (!m->buf)
101 return -ENOMEM; 113 return -ENOMEM;
102 } 114 }
@@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset)
135 147
136Eoverflow: 148Eoverflow:
137 m->op->stop(m, p); 149 m->op->stop(m, p);
138 kfree(m->buf); 150 kvfree(m->buf);
139 m->count = 0; 151 m->count = 0;
140 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); 152 m->buf = seq_buf_alloc(m->size <<= 1);
141 return !m->buf ? -ENOMEM : -EAGAIN; 153 return !m->buf ? -ENOMEM : -EAGAIN;
142} 154}
143 155
@@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
192 204
193 /* grab buffer if we didn't have one */ 205 /* grab buffer if we didn't have one */
194 if (!m->buf) { 206 if (!m->buf) {
195 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 207 m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
196 if (!m->buf) 208 if (!m->buf)
197 goto Enomem; 209 goto Enomem;
198 } 210 }
@@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
232 if (m->count < m->size) 244 if (m->count < m->size)
233 goto Fill; 245 goto Fill;
234 m->op->stop(m, p); 246 m->op->stop(m, p);
235 kfree(m->buf); 247 kvfree(m->buf);
236 m->count = 0; 248 m->count = 0;
237 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); 249 m->buf = seq_buf_alloc(m->size <<= 1);
238 if (!m->buf) 250 if (!m->buf)
239 goto Enomem; 251 goto Enomem;
240 m->version = 0; 252 m->version = 0;
@@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek);
350int seq_release(struct inode *inode, struct file *file) 362int seq_release(struct inode *inode, struct file *file)
351{ 363{
352 struct seq_file *m = file->private_data; 364 struct seq_file *m = file->private_data;
353 kfree(m->buf); 365 kvfree(m->buf);
354 kfree(m); 366 kfree(m);
355 return 0; 367 return 0;
356} 368}
@@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open);
605int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), 617int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
606 void *data, size_t size) 618 void *data, size_t size)
607{ 619{
608 char *buf = kmalloc(size, GFP_KERNEL); 620 char *buf = seq_buf_alloc(size);
609 int ret; 621 int ret;
610 if (!buf) 622 if (!buf)
611 return -ENOMEM; 623 return -ENOMEM;
612 ret = single_open(file, show, data); 624 ret = single_open(file, show, data);
613 if (ret) { 625 if (ret) {
614 kfree(buf); 626 kvfree(buf);
615 return ret; 627 return ret;
616 } 628 }
617 ((struct seq_file *)file->private_data)->buf = buf; 629 ((struct seq_file *)file->private_data)->buf = buf;
diff --git a/fs/splice.c b/fs/splice.c
index e246954ea48c..f5cb9ba84510 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,6 +32,7 @@
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/socket.h> 33#include <linux/socket.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/aio.h>
35#include "internal.h" 36#include "internal.h"
36 37
37/* 38/*
@@ -717,63 +718,6 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
717 sd->len, &pos, more); 718 sd->len, &pos, more);
718} 719}
719 720
720/*
721 * This is a little more tricky than the file -> pipe splicing. There are
722 * basically three cases:
723 *
724 * - Destination page already exists in the address space and there
725 * are users of it. For that case we have no other option that
726 * copying the data. Tough luck.
727 * - Destination page already exists in the address space, but there
728 * are no users of it. Make sure it's uptodate, then drop it. Fall
729 * through to last case.
730 * - Destination page does not exist, we can add the pipe page to
731 * the page cache and avoid the copy.
732 *
733 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
734 * sd->flags), we attempt to migrate pages from the pipe to the output
735 * file address space page cache. This is possible if no one else has
736 * the pipe page referenced outside of the pipe and page cache. If
737 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
738 * a new page in the output file page cache and fill/dirty that.
739 */
740int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
741 struct splice_desc *sd)
742{
743 struct file *file = sd->u.file;
744 struct address_space *mapping = file->f_mapping;
745 unsigned int offset, this_len;
746 struct page *page;
747 void *fsdata;
748 int ret;
749
750 offset = sd->pos & ~PAGE_CACHE_MASK;
751
752 this_len = sd->len;
753 if (this_len + offset > PAGE_CACHE_SIZE)
754 this_len = PAGE_CACHE_SIZE - offset;
755
756 ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
757 AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
758 if (unlikely(ret))
759 goto out;
760
761 if (buf->page != page) {
762 char *src = kmap_atomic(buf->page);
763 char *dst = kmap_atomic(page);
764
765 memcpy(dst + offset, src + buf->offset, this_len);
766 flush_dcache_page(page);
767 kunmap_atomic(dst);
768 kunmap_atomic(src);
769 }
770 ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
771 page, fsdata);
772out:
773 return ret;
774}
775EXPORT_SYMBOL(pipe_to_file);
776
777static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 721static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
778{ 722{
779 smp_mb(); 723 smp_mb();
@@ -802,7 +746,7 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
802 * locking is required around copying the pipe buffers to the 746 * locking is required around copying the pipe buffers to the
803 * destination. 747 * destination.
804 */ 748 */
805int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 749static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
806 splice_actor *actor) 750 splice_actor *actor)
807{ 751{
808 int ret; 752 int ret;
@@ -849,7 +793,6 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
849 793
850 return 1; 794 return 1;
851} 795}
852EXPORT_SYMBOL(splice_from_pipe_feed);
853 796
854/** 797/**
855 * splice_from_pipe_next - wait for some data to splice from 798 * splice_from_pipe_next - wait for some data to splice from
@@ -861,7 +804,7 @@ EXPORT_SYMBOL(splice_from_pipe_feed);
861 * value (one) if pipe buffers are available. It will return zero 804 * value (one) if pipe buffers are available. It will return zero
862 * or -errno if no more data needs to be spliced. 805 * or -errno if no more data needs to be spliced.
863 */ 806 */
864int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 807static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
865{ 808{
866 while (!pipe->nrbufs) { 809 while (!pipe->nrbufs) {
867 if (!pipe->writers) 810 if (!pipe->writers)
@@ -886,7 +829,6 @@ int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
886 829
887 return 1; 830 return 1;
888} 831}
889EXPORT_SYMBOL(splice_from_pipe_next);
890 832
891/** 833/**
892 * splice_from_pipe_begin - start splicing from pipe 834 * splice_from_pipe_begin - start splicing from pipe
@@ -897,12 +839,11 @@ EXPORT_SYMBOL(splice_from_pipe_next);
897 * splice_from_pipe_next() and splice_from_pipe_feed() to 839 * splice_from_pipe_next() and splice_from_pipe_feed() to
898 * initialize the necessary fields of @sd. 840 * initialize the necessary fields of @sd.
899 */ 841 */
900void splice_from_pipe_begin(struct splice_desc *sd) 842static void splice_from_pipe_begin(struct splice_desc *sd)
901{ 843{
902 sd->num_spliced = 0; 844 sd->num_spliced = 0;
903 sd->need_wakeup = false; 845 sd->need_wakeup = false;
904} 846}
905EXPORT_SYMBOL(splice_from_pipe_begin);
906 847
907/** 848/**
908 * splice_from_pipe_end - finish splicing from pipe 849 * splice_from_pipe_end - finish splicing from pipe
@@ -914,12 +855,11 @@ EXPORT_SYMBOL(splice_from_pipe_begin);
914 * be called after a loop containing splice_from_pipe_next() and 855 * be called after a loop containing splice_from_pipe_next() and
915 * splice_from_pipe_feed(). 856 * splice_from_pipe_feed().
916 */ 857 */
917void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 858static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
918{ 859{
919 if (sd->need_wakeup) 860 if (sd->need_wakeup)
920 wakeup_pipe_writers(pipe); 861 wakeup_pipe_writers(pipe);
921} 862}
922EXPORT_SYMBOL(splice_from_pipe_end);
923 863
924/** 864/**
925 * __splice_from_pipe - splice data from a pipe to given actor 865 * __splice_from_pipe - splice data from a pipe to given actor
@@ -985,7 +925,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
985} 925}
986 926
987/** 927/**
988 * generic_file_splice_write - splice data from a pipe to a file 928 * iter_file_splice_write - splice data from a pipe to a file
989 * @pipe: pipe info 929 * @pipe: pipe info
990 * @out: file to write to 930 * @out: file to write to
991 * @ppos: position in @out 931 * @ppos: position in @out
@@ -995,40 +935,122 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
995 * Description: 935 * Description:
996 * Will either move or copy pages (determined by @flags options) from 936 * Will either move or copy pages (determined by @flags options) from
997 * the given pipe inode to the given file. 937 * the given pipe inode to the given file.
938 * This one is ->write_iter-based.
998 * 939 *
999 */ 940 */
1000ssize_t 941ssize_t
1001generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 942iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1002 loff_t *ppos, size_t len, unsigned int flags) 943 loff_t *ppos, size_t len, unsigned int flags)
1003{ 944{
1004 struct address_space *mapping = out->f_mapping;
1005 struct inode *inode = mapping->host;
1006 struct splice_desc sd = { 945 struct splice_desc sd = {
1007 .total_len = len, 946 .total_len = len,
1008 .flags = flags, 947 .flags = flags,
1009 .pos = *ppos, 948 .pos = *ppos,
1010 .u.file = out, 949 .u.file = out,
1011 }; 950 };
951 int nbufs = pipe->buffers;
952 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
953 GFP_KERNEL);
1012 ssize_t ret; 954 ssize_t ret;
1013 955
956 if (unlikely(!array))
957 return -ENOMEM;
958
1014 pipe_lock(pipe); 959 pipe_lock(pipe);
1015 960
1016 splice_from_pipe_begin(&sd); 961 splice_from_pipe_begin(&sd);
1017 do { 962 while (sd.total_len) {
963 struct iov_iter from;
964 struct kiocb kiocb;
965 size_t left;
966 int n, idx;
967
1018 ret = splice_from_pipe_next(pipe, &sd); 968 ret = splice_from_pipe_next(pipe, &sd);
1019 if (ret <= 0) 969 if (ret <= 0)
1020 break; 970 break;
1021 971
1022 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 972 if (unlikely(nbufs < pipe->buffers)) {
1023 ret = file_remove_suid(out); 973 kfree(array);
1024 if (!ret) { 974 nbufs = pipe->buffers;
1025 ret = file_update_time(out); 975 array = kcalloc(nbufs, sizeof(struct bio_vec),
1026 if (!ret) 976 GFP_KERNEL);
1027 ret = splice_from_pipe_feed(pipe, &sd, 977 if (!array) {
1028 pipe_to_file); 978 ret = -ENOMEM;
979 break;
980 }
1029 } 981 }
1030 mutex_unlock(&inode->i_mutex); 982
1031 } while (ret > 0); 983 /* build the vector */
984 left = sd.total_len;
985 for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
986 struct pipe_buffer *buf = pipe->bufs + idx;
987 size_t this_len = buf->len;
988
989 if (this_len > left)
990 this_len = left;
991
992 if (idx == pipe->buffers - 1)
993 idx = -1;
994
995 ret = buf->ops->confirm(pipe, buf);
996 if (unlikely(ret)) {
997 if (ret == -ENODATA)
998 ret = 0;
999 goto done;
1000 }
1001
1002 array[n].bv_page = buf->page;
1003 array[n].bv_len = this_len;
1004 array[n].bv_offset = buf->offset;
1005 left -= this_len;
1006 }
1007
1008 /* ... iov_iter */
1009 from.type = ITER_BVEC | WRITE;
1010 from.bvec = array;
1011 from.nr_segs = n;
1012 from.count = sd.total_len - left;
1013 from.iov_offset = 0;
1014
1015 /* ... and iocb */
1016 init_sync_kiocb(&kiocb, out);
1017 kiocb.ki_pos = sd.pos;
1018 kiocb.ki_nbytes = sd.total_len - left;
1019
1020 /* now, send it */
1021 ret = out->f_op->write_iter(&kiocb, &from);
1022 if (-EIOCBQUEUED == ret)
1023 ret = wait_on_sync_kiocb(&kiocb);
1024
1025 if (ret <= 0)
1026 break;
1027
1028 sd.num_spliced += ret;
1029 sd.total_len -= ret;
1030 *ppos = sd.pos = kiocb.ki_pos;
1031
1032 /* dismiss the fully eaten buffers, adjust the partial one */
1033 while (ret) {
1034 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
1035 if (ret >= buf->len) {
1036 const struct pipe_buf_operations *ops = buf->ops;
1037 ret -= buf->len;
1038 buf->len = 0;
1039 buf->ops = NULL;
1040 ops->release(pipe, buf);
1041 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1042 pipe->nrbufs--;
1043 if (pipe->files)
1044 sd.need_wakeup = true;
1045 } else {
1046 buf->offset += ret;
1047 buf->len -= ret;
1048 ret = 0;
1049 }
1050 }
1051 }
1052done:
1053 kfree(array);
1032 splice_from_pipe_end(pipe, &sd); 1054 splice_from_pipe_end(pipe, &sd);
1033 1055
1034 pipe_unlock(pipe); 1056 pipe_unlock(pipe);
@@ -1036,21 +1058,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1036 if (sd.num_spliced) 1058 if (sd.num_spliced)
1037 ret = sd.num_spliced; 1059 ret = sd.num_spliced;
1038 1060
1039 if (ret > 0) {
1040 int err;
1041
1042 err = generic_write_sync(out, *ppos, ret);
1043 if (err)
1044 ret = err;
1045 else
1046 *ppos += ret;
1047 balance_dirty_pages_ratelimited(mapping);
1048 }
1049
1050 return ret; 1061 return ret;
1051} 1062}
1052 1063
1053EXPORT_SYMBOL(generic_file_splice_write); 1064EXPORT_SYMBOL(iter_file_splice_write);
1054 1065
1055static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1066static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1056 struct splice_desc *sd) 1067 struct splice_desc *sd)
@@ -1549,7 +1560,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1549 goto out; 1560 goto out;
1550 1561
1551 count = ret; 1562 count = ret;
1552 iov_iter_init(&iter, iov, nr_segs, count, 0); 1563 iov_iter_init(&iter, READ, iov, nr_segs, count);
1553 1564
1554 sd.len = 0; 1565 sd.len = 0;
1555 sd.total_len = count; 1566 sd.total_len = count;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 9d4dc6831792..b00811c75b24 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -21,10 +21,10 @@
21 */ 21 */
22const struct file_operations sysv_file_operations = { 22const struct file_operations sysv_file_operations = {
23 .llseek = generic_file_llseek, 23 .llseek = generic_file_llseek,
24 .read = do_sync_read, 24 .read = new_sync_read,
25 .aio_read = generic_file_aio_read, 25 .read_iter = generic_file_read_iter,
26 .write = do_sync_write, 26 .write = new_sync_write,
27 .aio_write = generic_file_aio_write, 27 .write_iter = generic_file_write_iter,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = generic_file_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ab7f7dfb98b..b5b593c45270 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1364,17 +1364,17 @@ static inline int mctime_update_needed(const struct inode *inode,
1364 1364
1365/** 1365/**
1366 * update_ctime - update mtime and ctime of an inode. 1366 * update_ctime - update mtime and ctime of an inode.
1367 * @c: UBIFS file-system description object
1368 * @inode: inode to update 1367 * @inode: inode to update
1369 * 1368 *
1370 * This function updates mtime and ctime of the inode if it is not equivalent to 1369 * This function updates mtime and ctime of the inode if it is not equivalent to
1371 * current time. Returns zero in case of success and a negative error code in 1370 * current time. Returns zero in case of success and a negative error code in
1372 * case of failure. 1371 * case of failure.
1373 */ 1372 */
1374static int update_mctime(struct ubifs_info *c, struct inode *inode) 1373static int update_mctime(struct inode *inode)
1375{ 1374{
1376 struct timespec now = ubifs_current_time(inode); 1375 struct timespec now = ubifs_current_time(inode);
1377 struct ubifs_inode *ui = ubifs_inode(inode); 1376 struct ubifs_inode *ui = ubifs_inode(inode);
1377 struct ubifs_info *c = inode->i_sb->s_fs_info;
1378 1378
1379 if (mctime_update_needed(inode, &now)) { 1379 if (mctime_update_needed(inode, &now)) {
1380 int err, release; 1380 int err, release;
@@ -1397,18 +1397,13 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
1397 return 0; 1397 return 0;
1398} 1398}
1399 1399
1400static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, 1400static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
1401 unsigned long nr_segs, loff_t pos)
1402{ 1401{
1403 int err; 1402 int err = update_mctime(file_inode(iocb->ki_filp));
1404 struct inode *inode = iocb->ki_filp->f_mapping->host;
1405 struct ubifs_info *c = inode->i_sb->s_fs_info;
1406
1407 err = update_mctime(c, inode);
1408 if (err) 1403 if (err)
1409 return err; 1404 return err;
1410 1405
1411 return generic_file_aio_write(iocb, iov, nr_segs, pos); 1406 return generic_file_write_iter(iocb, from);
1412} 1407}
1413 1408
1414static int ubifs_set_page_dirty(struct page *page) 1409static int ubifs_set_page_dirty(struct page *page)
@@ -1582,15 +1577,15 @@ const struct inode_operations ubifs_symlink_inode_operations = {
1582 1577
1583const struct file_operations ubifs_file_operations = { 1578const struct file_operations ubifs_file_operations = {
1584 .llseek = generic_file_llseek, 1579 .llseek = generic_file_llseek,
1585 .read = do_sync_read, 1580 .read = new_sync_read,
1586 .write = do_sync_write, 1581 .write = new_sync_write,
1587 .aio_read = generic_file_aio_read, 1582 .read_iter = generic_file_read_iter,
1588 .aio_write = ubifs_aio_write, 1583 .write_iter = ubifs_write_iter,
1589 .mmap = ubifs_file_mmap, 1584 .mmap = ubifs_file_mmap,
1590 .fsync = ubifs_fsync, 1585 .fsync = ubifs_fsync,
1591 .unlocked_ioctl = ubifs_ioctl, 1586 .unlocked_ioctl = ubifs_ioctl,
1592 .splice_read = generic_file_splice_read, 1587 .splice_read = generic_file_splice_read,
1593 .splice_write = generic_file_splice_write, 1588 .splice_write = iter_file_splice_write,
1594#ifdef CONFIG_COMPAT 1589#ifdef CONFIG_COMPAT
1595 .compat_ioctl = ubifs_compat_ioctl, 1590 .compat_ioctl = ubifs_compat_ioctl,
1596#endif 1591#endif
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d2c170f8b035..d80738fdf424 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -119,8 +119,8 @@ static int udf_adinicb_write_end(struct file *file,
119} 119}
120 120
121static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, 121static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
122 const struct iovec *iov, 122 struct iov_iter *iter,
123 loff_t offset, unsigned long nr_segs) 123 loff_t offset)
124{ 124{
125 /* Fallback to buffered I/O. */ 125 /* Fallback to buffered I/O. */
126 return 0; 126 return 0;
@@ -134,8 +134,7 @@ const struct address_space_operations udf_adinicb_aops = {
134 .direct_IO = udf_adinicb_direct_IO, 134 .direct_IO = udf_adinicb_direct_IO,
135}; 135};
136 136
137static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 137static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
138 unsigned long nr_segs, loff_t ppos)
139{ 138{
140 ssize_t retval; 139 ssize_t retval;
141 struct file *file = iocb->ki_filp; 140 struct file *file = iocb->ki_filp;
@@ -150,7 +149,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
150 if (file->f_flags & O_APPEND) 149 if (file->f_flags & O_APPEND)
151 pos = inode->i_size; 150 pos = inode->i_size;
152 else 151 else
153 pos = ppos; 152 pos = iocb->ki_pos;
154 153
155 if (inode->i_sb->s_blocksize < 154 if (inode->i_sb->s_blocksize <
156 (udf_file_entry_alloc_offset(inode) + 155 (udf_file_entry_alloc_offset(inode) +
@@ -171,7 +170,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
171 } else 170 } else
172 up_write(&iinfo->i_data_sem); 171 up_write(&iinfo->i_data_sem);
173 172
174 retval = __generic_file_aio_write(iocb, iov, nr_segs); 173 retval = __generic_file_write_iter(iocb, from);
175 mutex_unlock(&inode->i_mutex); 174 mutex_unlock(&inode->i_mutex);
176 175
177 if (retval > 0) { 176 if (retval > 0) {
@@ -252,13 +251,13 @@ static int udf_release_file(struct inode *inode, struct file *filp)
252} 251}
253 252
254const struct file_operations udf_file_operations = { 253const struct file_operations udf_file_operations = {
255 .read = do_sync_read, 254 .read = new_sync_read,
256 .aio_read = generic_file_aio_read, 255 .read_iter = generic_file_read_iter,
257 .unlocked_ioctl = udf_ioctl, 256 .unlocked_ioctl = udf_ioctl,
258 .open = generic_file_open, 257 .open = generic_file_open,
259 .mmap = generic_file_mmap, 258 .mmap = generic_file_mmap,
260 .write = do_sync_write, 259 .write = new_sync_write,
261 .aio_write = udf_file_aio_write, 260 .write_iter = udf_file_write_iter,
262 .release = udf_release_file, 261 .release = udf_release_file,
263 .fsync = generic_file_fsync, 262 .fsync = generic_file_fsync,
264 .splice_read = generic_file_splice_read, 263 .splice_read = generic_file_splice_read,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5d643706212f..236cd48184c2 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -217,18 +217,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
217} 217}
218 218
219static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, 219static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
220 const struct iovec *iov, 220 struct iov_iter *iter,
221 loff_t offset, unsigned long nr_segs) 221 loff_t offset)
222{ 222{
223 struct file *file = iocb->ki_filp; 223 struct file *file = iocb->ki_filp;
224 struct address_space *mapping = file->f_mapping; 224 struct address_space *mapping = file->f_mapping;
225 struct inode *inode = mapping->host; 225 struct inode *inode = mapping->host;
226 size_t count = iov_iter_count(iter);
226 ssize_t ret; 227 ssize_t ret;
227 228
228 ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 229 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block);
229 udf_get_block);
230 if (unlikely(ret < 0 && (rw & WRITE))) 230 if (unlikely(ret < 0 && (rw & WRITE)))
231 udf_write_failed(mapping, offset + iov_length(iov, nr_segs)); 231 udf_write_failed(mapping, offset + count);
232 return ret; 232 return ret;
233} 233}
234 234
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 33afa20d4509..c84ec010a676 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -35,10 +35,10 @@
35 35
36const struct file_operations ufs_file_operations = { 36const struct file_operations ufs_file_operations = {
37 .llseek = generic_file_llseek, 37 .llseek = generic_file_llseek,
38 .read = do_sync_read, 38 .read = new_sync_read,
39 .aio_read = generic_file_aio_read, 39 .read_iter = generic_file_read_iter,
40 .write = do_sync_write, 40 .write = new_sync_write,
41 .aio_write = generic_file_aio_write, 41 .write_iter = generic_file_write_iter,
42 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
43 .open = generic_file_open, 43 .open = generic_file_open,
44 .fsync = generic_file_fsync, 44 .fsync = generic_file_fsync,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e32640eedea6..faaf716e2080 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1486,9 +1486,8 @@ STATIC ssize_t
1486xfs_vm_direct_IO( 1486xfs_vm_direct_IO(
1487 int rw, 1487 int rw,
1488 struct kiocb *iocb, 1488 struct kiocb *iocb,
1489 const struct iovec *iov, 1489 struct iov_iter *iter,
1490 loff_t offset, 1490 loff_t offset)
1491 unsigned long nr_segs)
1492{ 1491{
1493 struct inode *inode = iocb->ki_filp->f_mapping->host; 1492 struct inode *inode = iocb->ki_filp->f_mapping->host;
1494 struct block_device *bdev = xfs_find_bdev_for_inode(inode); 1493 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
@@ -1496,7 +1495,7 @@ xfs_vm_direct_IO(
1496 ssize_t ret; 1495 ssize_t ret;
1497 1496
1498 if (rw & WRITE) { 1497 if (rw & WRITE) {
1499 size_t size = iov_length(iov, nr_segs); 1498 size_t size = iov_iter_count(iter);
1500 1499
1501 /* 1500 /*
1502 * We cannot preallocate a size update transaction here as we 1501 * We cannot preallocate a size update transaction here as we
@@ -1508,17 +1507,15 @@ xfs_vm_direct_IO(
1508 if (offset + size > XFS_I(inode)->i_d.di_size) 1507 if (offset + size > XFS_I(inode)->i_d.di_size)
1509 ioend->io_isdirect = 1; 1508 ioend->io_isdirect = 1;
1510 1509
1511 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1510 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1512 offset, nr_segs, 1511 offset, xfs_get_blocks_direct,
1513 xfs_get_blocks_direct,
1514 xfs_end_io_direct_write, NULL, 1512 xfs_end_io_direct_write, NULL,
1515 DIO_ASYNC_EXTEND); 1513 DIO_ASYNC_EXTEND);
1516 if (ret != -EIOCBQUEUED && iocb->private) 1514 if (ret != -EIOCBQUEUED && iocb->private)
1517 goto out_destroy_ioend; 1515 goto out_destroy_ioend;
1518 } else { 1516 } else {
1519 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1517 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1520 offset, nr_segs, 1518 offset, xfs_get_blocks_direct,
1521 xfs_get_blocks_direct,
1522 NULL, NULL, 0); 1519 NULL, NULL, 0);
1523 } 1520 }
1524 1521
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1b8160dc04d1..1f66779d7a46 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -229,34 +229,27 @@ xfs_file_fsync(
229} 229}
230 230
231STATIC ssize_t 231STATIC ssize_t
232xfs_file_aio_read( 232xfs_file_read_iter(
233 struct kiocb *iocb, 233 struct kiocb *iocb,
234 const struct iovec *iovp, 234 struct iov_iter *to)
235 unsigned long nr_segs,
236 loff_t pos)
237{ 235{
238 struct file *file = iocb->ki_filp; 236 struct file *file = iocb->ki_filp;
239 struct inode *inode = file->f_mapping->host; 237 struct inode *inode = file->f_mapping->host;
240 struct xfs_inode *ip = XFS_I(inode); 238 struct xfs_inode *ip = XFS_I(inode);
241 struct xfs_mount *mp = ip->i_mount; 239 struct xfs_mount *mp = ip->i_mount;
242 size_t size = 0; 240 size_t size = iov_iter_count(to);
243 ssize_t ret = 0; 241 ssize_t ret = 0;
244 int ioflags = 0; 242 int ioflags = 0;
245 xfs_fsize_t n; 243 xfs_fsize_t n;
244 loff_t pos = iocb->ki_pos;
246 245
247 XFS_STATS_INC(xs_read_calls); 246 XFS_STATS_INC(xs_read_calls);
248 247
249 BUG_ON(iocb->ki_pos != pos);
250
251 if (unlikely(file->f_flags & O_DIRECT)) 248 if (unlikely(file->f_flags & O_DIRECT))
252 ioflags |= IO_ISDIRECT; 249 ioflags |= IO_ISDIRECT;
253 if (file->f_mode & FMODE_NOCMTIME) 250 if (file->f_mode & FMODE_NOCMTIME)
254 ioflags |= IO_INVIS; 251 ioflags |= IO_INVIS;
255 252
256 ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
257 if (ret < 0)
258 return ret;
259
260 if (unlikely(ioflags & IO_ISDIRECT)) { 253 if (unlikely(ioflags & IO_ISDIRECT)) {
261 xfs_buftarg_t *target = 254 xfs_buftarg_t *target =
262 XFS_IS_REALTIME_INODE(ip) ? 255 XFS_IS_REALTIME_INODE(ip) ?
@@ -309,7 +302,7 @@ xfs_file_aio_read(
309 302
310 trace_xfs_file_read(ip, size, pos, ioflags); 303 trace_xfs_file_read(ip, size, pos, ioflags);
311 304
312 ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); 305 ret = generic_file_read_iter(iocb, to);
313 if (ret > 0) 306 if (ret > 0)
314 XFS_STATS_ADD(xs_read_bytes, ret); 307 XFS_STATS_ADD(xs_read_bytes, ret);
315 308
@@ -350,47 +343,6 @@ xfs_file_splice_read(
350} 343}
351 344
352/* 345/*
353 * xfs_file_splice_write() does not use xfs_rw_ilock() because
354 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
355 * couuld cause lock inversions between the aio_write path and the splice path
356 * if someone is doing concurrent splice(2) based writes and write(2) based
357 * writes to the same inode. The only real way to fix this is to re-implement
358 * the generic code here with correct locking orders.
359 */
360STATIC ssize_t
361xfs_file_splice_write(
362 struct pipe_inode_info *pipe,
363 struct file *outfilp,
364 loff_t *ppos,
365 size_t count,
366 unsigned int flags)
367{
368 struct inode *inode = outfilp->f_mapping->host;
369 struct xfs_inode *ip = XFS_I(inode);
370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
374
375 if (outfilp->f_mode & FMODE_NOCMTIME)
376 ioflags |= IO_INVIS;
377
378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
384
385 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
386 if (ret > 0)
387 XFS_STATS_ADD(xs_write_bytes, ret);
388
389 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
390 return ret;
391}
392
393/*
394 * This routine is called to handle zeroing any space in the last block of the 346 * This routine is called to handle zeroing any space in the last block of the
395 * file that is beyond the EOF. We do this since the size is being increased 347 * file that is beyond the EOF. We do this since the size is being increased
396 * without writing anything to that block and we don't want to read the 348 * without writing anything to that block and we don't want to read the
@@ -625,10 +577,7 @@ restart:
625STATIC ssize_t 577STATIC ssize_t
626xfs_file_dio_aio_write( 578xfs_file_dio_aio_write(
627 struct kiocb *iocb, 579 struct kiocb *iocb,
628 const struct iovec *iovp, 580 struct iov_iter *from)
629 unsigned long nr_segs,
630 loff_t pos,
631 size_t ocount)
632{ 581{
633 struct file *file = iocb->ki_filp; 582 struct file *file = iocb->ki_filp;
634 struct address_space *mapping = file->f_mapping; 583 struct address_space *mapping = file->f_mapping;
@@ -636,9 +585,10 @@ xfs_file_dio_aio_write(
636 struct xfs_inode *ip = XFS_I(inode); 585 struct xfs_inode *ip = XFS_I(inode);
637 struct xfs_mount *mp = ip->i_mount; 586 struct xfs_mount *mp = ip->i_mount;
638 ssize_t ret = 0; 587 ssize_t ret = 0;
639 size_t count = ocount;
640 int unaligned_io = 0; 588 int unaligned_io = 0;
641 int iolock; 589 int iolock;
590 size_t count = iov_iter_count(from);
591 loff_t pos = iocb->ki_pos;
642 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 592 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
643 mp->m_rtdev_targp : mp->m_ddev_targp; 593 mp->m_rtdev_targp : mp->m_ddev_targp;
644 594
@@ -677,6 +627,7 @@ xfs_file_dio_aio_write(
677 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); 627 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
678 if (ret) 628 if (ret)
679 goto out; 629 goto out;
630 iov_iter_truncate(from, count);
680 631
681 if (mapping->nrpages) { 632 if (mapping->nrpages) {
682 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 633 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
@@ -698,8 +649,7 @@ xfs_file_dio_aio_write(
698 } 649 }
699 650
700 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 651 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
701 ret = generic_file_direct_write(iocb, iovp, 652 ret = generic_file_direct_write(iocb, from, pos);
702 &nr_segs, pos, count, ocount);
703 653
704out: 654out:
705 xfs_rw_iunlock(ip, iolock); 655 xfs_rw_iunlock(ip, iolock);
@@ -712,10 +662,7 @@ out:
712STATIC ssize_t 662STATIC ssize_t
713xfs_file_buffered_aio_write( 663xfs_file_buffered_aio_write(
714 struct kiocb *iocb, 664 struct kiocb *iocb,
715 const struct iovec *iovp, 665 struct iov_iter *from)
716 unsigned long nr_segs,
717 loff_t pos,
718 size_t count)
719{ 666{
720 struct file *file = iocb->ki_filp; 667 struct file *file = iocb->ki_filp;
721 struct address_space *mapping = file->f_mapping; 668 struct address_space *mapping = file->f_mapping;
@@ -724,7 +671,8 @@ xfs_file_buffered_aio_write(
724 ssize_t ret; 671 ssize_t ret;
725 int enospc = 0; 672 int enospc = 0;
726 int iolock = XFS_IOLOCK_EXCL; 673 int iolock = XFS_IOLOCK_EXCL;
727 struct iov_iter from; 674 loff_t pos = iocb->ki_pos;
675 size_t count = iov_iter_count(from);
728 676
729 xfs_rw_ilock(ip, iolock); 677 xfs_rw_ilock(ip, iolock);
730 678
@@ -732,13 +680,13 @@ xfs_file_buffered_aio_write(
732 if (ret) 680 if (ret)
733 goto out; 681 goto out;
734 682
735 iov_iter_init(&from, iovp, nr_segs, count, 0); 683 iov_iter_truncate(from, count);
736 /* We can write back this queue in page reclaim */ 684 /* We can write back this queue in page reclaim */
737 current->backing_dev_info = mapping->backing_dev_info; 685 current->backing_dev_info = mapping->backing_dev_info;
738 686
739write_retry: 687write_retry:
740 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 688 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
741 ret = generic_perform_write(file, &from, pos); 689 ret = generic_perform_write(file, from, pos);
742 if (likely(ret >= 0)) 690 if (likely(ret >= 0))
743 iocb->ki_pos = pos + ret; 691 iocb->ki_pos = pos + ret;
744 /* 692 /*
@@ -759,40 +707,29 @@ out:
759} 707}
760 708
761STATIC ssize_t 709STATIC ssize_t
762xfs_file_aio_write( 710xfs_file_write_iter(
763 struct kiocb *iocb, 711 struct kiocb *iocb,
764 const struct iovec *iovp, 712 struct iov_iter *from)
765 unsigned long nr_segs,
766 loff_t pos)
767{ 713{
768 struct file *file = iocb->ki_filp; 714 struct file *file = iocb->ki_filp;
769 struct address_space *mapping = file->f_mapping; 715 struct address_space *mapping = file->f_mapping;
770 struct inode *inode = mapping->host; 716 struct inode *inode = mapping->host;
771 struct xfs_inode *ip = XFS_I(inode); 717 struct xfs_inode *ip = XFS_I(inode);
772 ssize_t ret; 718 ssize_t ret;
773 size_t ocount = 0; 719 size_t ocount = iov_iter_count(from);
774 720
775 XFS_STATS_INC(xs_write_calls); 721 XFS_STATS_INC(xs_write_calls);
776 722
777 BUG_ON(iocb->ki_pos != pos);
778
779 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
780 if (ret)
781 return ret;
782
783 if (ocount == 0) 723 if (ocount == 0)
784 return 0; 724 return 0;
785 725
786 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 726 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
787 ret = -EIO; 727 return -EIO;
788 goto out;
789 }
790 728
791 if (unlikely(file->f_flags & O_DIRECT)) 729 if (unlikely(file->f_flags & O_DIRECT))
792 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); 730 ret = xfs_file_dio_aio_write(iocb, from);
793 else 731 else
794 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 732 ret = xfs_file_buffered_aio_write(iocb, from);
795 ocount);
796 733
797 if (ret > 0) { 734 if (ret > 0) {
798 ssize_t err; 735 ssize_t err;
@@ -804,8 +741,6 @@ xfs_file_aio_write(
804 if (err < 0) 741 if (err < 0)
805 ret = err; 742 ret = err;
806 } 743 }
807
808out:
809 return ret; 744 return ret;
810} 745}
811 746
@@ -1461,12 +1396,12 @@ xfs_file_llseek(
1461 1396
1462const struct file_operations xfs_file_operations = { 1397const struct file_operations xfs_file_operations = {
1463 .llseek = xfs_file_llseek, 1398 .llseek = xfs_file_llseek,
1464 .read = do_sync_read, 1399 .read = new_sync_read,
1465 .write = do_sync_write, 1400 .write = new_sync_write,
1466 .aio_read = xfs_file_aio_read, 1401 .read_iter = xfs_file_read_iter,
1467 .aio_write = xfs_file_aio_write, 1402 .write_iter = xfs_file_write_iter,
1468 .splice_read = xfs_file_splice_read, 1403 .splice_read = xfs_file_splice_read,
1469 .splice_write = xfs_file_splice_write, 1404 .splice_write = iter_file_splice_write,
1470 .unlocked_ioctl = xfs_file_ioctl, 1405 .unlocked_ioctl = xfs_file_ioctl,
1471#ifdef CONFIG_COMPAT 1406#ifdef CONFIG_COMPAT
1472 .compat_ioctl = xfs_file_compat_ioctl, 1407 .compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6910458915cf..152f82782630 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1118,7 +1118,6 @@ DEFINE_RW_EVENT(xfs_file_read);
1118DEFINE_RW_EVENT(xfs_file_buffered_write); 1118DEFINE_RW_EVENT(xfs_file_buffered_write);
1119DEFINE_RW_EVENT(xfs_file_direct_write); 1119DEFINE_RW_EVENT(xfs_file_direct_write);
1120DEFINE_RW_EVENT(xfs_file_splice_read); 1120DEFINE_RW_EVENT(xfs_file_splice_read);
1121DEFINE_RW_EVENT(xfs_file_splice_write);
1122 1121
1123DECLARE_EVENT_CLASS(xfs_page_class, 1122DECLARE_EVENT_CLASS(xfs_page_class,
1124 TP_PROTO(struct inode *inode, struct page *page, unsigned long off, 1123 TP_PROTO(struct inode *inode, struct page *page, unsigned long off,