aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-08-11 14:44:11 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-11 14:44:11 -0400
commitf6f993328b2abcab86a3c99d7bd9f2066ab03d36 (patch)
treeea6f3902a0fa546493731b3b52a31d98cc747a90
parentc7a19c795b4b0a3232c157ed29eea85077e95da6 (diff)
parent12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs updates from Al Viro: "Stuff in here: - acct.c fixes and general rework of mnt_pin mechanism. That allows to go for delayed-mntput stuff, which will permit mntput() on deep stack without worrying about stack overflows - fs shutdown will happen on shallow stack. IOW, we can do Eric's umount-on-rmdir series without introducing tons of stack overflows on new mntput() call chains it introduces. - Bruce's d_splice_alias() patches - more Miklos' rename() stuff. - a couple of regression fixes (stable fodder, in the end of branch) and a fix for API idiocy in iov_iter.c. There definitely will be another pile, maybe even two. I'd like to get Eric's series in this time, but even if we miss it, it'll go right in the beginning of for-next in the next cycle - the tricky part of prereqs is in this pile" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (40 commits) fix copy_tree() regression __generic_file_write_iter(): fix handling of sync error after DIO switch iov_iter_get_pages() to passing maximal number of pages fs: mark __d_obtain_alias static dcache: d_splice_alias should detect loops exportfs: update Exporting documentation dcache: d_find_alias needn't recheck IS_ROOT && DCACHE_DISCONNECTED dcache: remove unused d_find_alias parameter dcache: d_obtain_alias callers don't all want DISCONNECTED dcache: d_splice_alias should ignore DCACHE_DISCONNECTED dcache: d_splice_alias mustn't create directory aliases dcache: close d_move race in d_splice_alias dcache: move d_splice_alias namei: trivial fix to vfs_rename_dir comment VFS: allow ->d_manage() to declare -EISDIR in rcu_walk mode. cifs: support RENAME_NOREPLACE hostfs: support rename flags shmem: support RENAME_EXCHANGE shmem: support RENAME_NOREPLACE btrfs: add RENAME_NOREPLACE ...
-rw-r--r--Documentation/filesystems/nfs/Exporting38
-rw-r--r--Documentation/filesystems/vfs.txt3
-rw-r--r--fs/Makefile2
-rw-r--r--fs/bad_inode.c7
-rw-r--r--fs/btrfs/inode.c12
-rw-r--r--fs/btrfs/super.c9
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/inode.c14
-rw-r--r--fs/dcache.c196
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/fs_pin.c78
-rw-r--r--fs/fuse/dir.c7
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/hostfs/hostfs.h1
-rw-r--r--fs/hostfs/hostfs_kern.c30
-rw-r--r--fs/hostfs/hostfs_user.c28
-rw-r--r--fs/internal.h7
-rw-r--r--fs/mount.h2
-rw-r--r--fs/namei.c34
-rw-r--r--fs/namespace.c67
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/super.c19
-rw-r--r--include/linux/acct.h4
-rw-r--r--include/linux/dcache.h1
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/fs_pin.h17
-rw-r--r--include/linux/mount.h4
-rw-r--r--include/linux/uio.h2
-rw-r--r--kernel/acct.c456
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/iov_iter.c17
-rw-r--r--mm/shmem.c32
36 files changed, 645 insertions, 465 deletions
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index e543b1a619cc..c8f036a9b13f 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of
66 66
67c/ Helper routines to allocate anonymous dentries, and to help attach 67c/ Helper routines to allocate anonymous dentries, and to help attach
68 loose directory dentries at lookup time. They are: 68 loose directory dentries at lookup time. They are:
69 d_alloc_anon(inode) will return a dentry for the given inode. 69 d_obtain_alias(inode) will return a dentry for the given inode.
70 If the inode already has a dentry, one of those is returned. 70 If the inode already has a dentry, one of those is returned.
71 If it doesn't, a new anonymous (IS_ROOT and 71 If it doesn't, a new anonymous (IS_ROOT and
72 DCACHE_DISCONNECTED) dentry is allocated and attached. 72 DCACHE_DISCONNECTED) dentry is allocated and attached.
73 In the case of a directory, care is taken that only one dentry 73 In the case of a directory, care is taken that only one dentry
74 can ever be attached. 74 can ever be attached.
75 d_splice_alias(inode, dentry) will make sure that there is a 75 d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
76 dentry with the same name and parent as the given dentry, and 76 will introduce a new dentry into the tree; either the passed-in
77 which refers to the given inode. 77 dentry or a preexisting alias for the given inode (such as an
78 If the inode is a directory and already has a dentry, then that 78 anonymous one created by d_obtain_alias), if appropriate. The two
79 dentry is d_moved over the given dentry. 79 functions differ in their handling of directories with preexisting
80 If the passed dentry gets attached, care is taken that this is 80 aliases:
81 mutually exclusive to a d_alloc_anon operation. 81 d_splice_alias will use any existing IS_ROOT dentry, but it will
82 If the passed dentry is used, NULL is returned, else the used 82 return -EIO rather than try to move a dentry with a different
83 dentry is returned. This corresponds to the calling pattern of 83 parent. This is appropriate for local filesystems, which
84 ->lookup. 84 should never see such an alias unless the filesystem is
85 85 corrupted somehow (for example, if two on-disk directory
86 entries refer to the same directory.)
87 d_materialise_unique will attempt to move any dentry. This is
88 appropriate for distributed filesystems, where finding a
89 directory other than where we last cached it may be a normal
90 consequence of concurrent operations on other hosts.
91 Both functions return NULL when the passed-in dentry is used,
92 following the calling convention of ->lookup.
93
86 94
87Filesystem Issues 95Filesystem Issues
88----------------- 96-----------------
@@ -120,12 +128,12 @@ struct which has the following members:
120 128
121 fh_to_dentry (mandatory) 129 fh_to_dentry (mandatory)
122 Given a filehandle fragment, this should find the implied object and 130 Given a filehandle fragment, this should find the implied object and
123 create a dentry for it (possibly with d_alloc_anon). 131 create a dentry for it (possibly with d_obtain_alias).
124 132
125 fh_to_parent (optional but strongly recommended) 133 fh_to_parent (optional but strongly recommended)
126 Given a filehandle fragment, this should find the parent of the 134 Given a filehandle fragment, this should find the parent of the
127 implied object and create a dentry for it (possibly with d_alloc_anon). 135 implied object and create a dentry for it (possibly with
128 May fail if the filehandle fragment is too small. 136 d_obtain_alias). May fail if the filehandle fragment is too small.
129 137
130 get_parent (optional but strongly recommended) 138 get_parent (optional but strongly recommended)
131 When given a dentry for a directory, this should return a dentry for 139 When given a dentry for a directory, this should return a dentry for
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index a1d0d7a30165..61d65cc65c54 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -1053,7 +1053,8 @@ struct dentry_operations {
1053 If the 'rcu_walk' parameter is true, then the caller is doing a 1053 If the 'rcu_walk' parameter is true, then the caller is doing a
1054 pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, 1054 pathwalk in RCU-walk mode. Sleeping is not permitted in this mode,
1055 and the caller can be asked to leave it and call again by returning 1055 and the caller can be asked to leave it and call again by returning
1056 -ECHILD. 1056 -ECHILD. -EISDIR may also be returned to tell pathwalk to
1057 ignore d_automount or any mounts.
1057 1058
1058 This function is only used if DCACHE_MANAGE_TRANSIT is set on the 1059 This function is only used if DCACHE_MANAGE_TRANSIT is set on the
1059 dentry being transited from. 1060 dentry being transited from.
diff --git a/fs/Makefile b/fs/Makefile
index 4030cbfbc9af..90c88529892b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o splice.o sync.o utimes.o \ 13 pnode.o splice.o sync.o utimes.o \
14 stack.o fs_struct.o statfs.o 14 stack.o fs_struct.o statfs.o fs_pin.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o block_dev.o direct-io.o mpage.o 17obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 7c93953030fb..afd2b4408adf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
218 return -EIO; 218 return -EIO;
219} 219}
220 220
221static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, 221static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
222 struct inode *new_dir, struct dentry *new_dentry) 222 struct inode *new_dir, struct dentry *new_dentry,
223 unsigned int flags)
223{ 224{
224 return -EIO; 225 return -EIO;
225} 226}
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
279 .mkdir = bad_inode_mkdir, 280 .mkdir = bad_inode_mkdir,
280 .rmdir = bad_inode_rmdir, 281 .rmdir = bad_inode_rmdir,
281 .mknod = bad_inode_mknod, 282 .mknod = bad_inode_mknod,
282 .rename = bad_inode_rename, 283 .rename2 = bad_inode_rename2,
283 .readlink = bad_inode_readlink, 284 .readlink = bad_inode_readlink,
284 /* follow_link must be no-op, otherwise unmounting this inode 285 /* follow_link must be no-op, otherwise unmounting this inode
285 won't work */ 286 won't work */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3668048e16f8..3183742d6f0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8476,6 +8476,16 @@ out_notrans:
8476 return ret; 8476 return ret;
8477} 8477}
8478 8478
8479static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
8480 struct inode *new_dir, struct dentry *new_dentry,
8481 unsigned int flags)
8482{
8483 if (flags & ~RENAME_NOREPLACE)
8484 return -EINVAL;
8485
8486 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
8487}
8488
8479static void btrfs_run_delalloc_work(struct btrfs_work *work) 8489static void btrfs_run_delalloc_work(struct btrfs_work *work)
8480{ 8490{
8481 struct btrfs_delalloc_work *delalloc_work; 8491 struct btrfs_delalloc_work *delalloc_work;
@@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
9019 .link = btrfs_link, 9029 .link = btrfs_link,
9020 .mkdir = btrfs_mkdir, 9030 .mkdir = btrfs_mkdir,
9021 .rmdir = btrfs_rmdir, 9031 .rmdir = btrfs_rmdir,
9022 .rename = btrfs_rename, 9032 .rename2 = btrfs_rename2,
9023 .symlink = btrfs_symlink, 9033 .symlink = btrfs_symlink,
9024 .setattr = btrfs_setattr, 9034 .setattr = btrfs_setattr,
9025 .mknod = btrfs_mknod, 9035 .mknod = btrfs_mknod,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8e16bca69c56..67b48b9a03e0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
851 struct btrfs_path *path; 851 struct btrfs_path *path;
852 struct btrfs_key location; 852 struct btrfs_key location;
853 struct inode *inode; 853 struct inode *inode;
854 struct dentry *dentry;
855 u64 dir_id; 854 u64 dir_id;
856 int new = 0; 855 int new = 0;
857 856
@@ -922,13 +921,7 @@ setup_root:
922 return dget(sb->s_root); 921 return dget(sb->s_root);
923 } 922 }
924 923
925 dentry = d_obtain_alias(inode); 924 return d_obtain_root(inode);
926 if (!IS_ERR(dentry)) {
927 spin_lock(&dentry->d_lock);
928 dentry->d_flags &= ~DCACHE_DISCONNECTED;
929 spin_unlock(&dentry->d_lock);
930 }
931 return dentry;
932} 925}
933 926
934static int btrfs_fill_super(struct super_block *sb, 927static int btrfs_fill_super(struct super_block *sb,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 06150fd745ac..f6e12377335c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
755 goto out; 755 goto out;
756 } 756 }
757 } else { 757 } else {
758 root = d_obtain_alias(inode); 758 root = d_obtain_root(inode);
759 } 759 }
760 ceph_init_dentry(root); 760 ceph_init_dentry(root);
761 dout("open_root_inode success, root dentry is %p\n", root); 761 dout("open_root_inode success, root dentry is %p\n", root);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 888398067420..ac4f260155c8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = {
848 .link = cifs_hardlink, 848 .link = cifs_hardlink,
849 .mkdir = cifs_mkdir, 849 .mkdir = cifs_mkdir,
850 .rmdir = cifs_rmdir, 850 .rmdir = cifs_rmdir,
851 .rename = cifs_rename, 851 .rename2 = cifs_rename2,
852 .permission = cifs_permission, 852 .permission = cifs_permission,
853/* revalidate:cifs_revalidate, */ 853/* revalidate:cifs_revalidate, */
854 .setattr = cifs_setattr, 854 .setattr = cifs_setattr,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 560480263336..b0fafa499505 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
68extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); 68extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
69extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); 69extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
70extern int cifs_rmdir(struct inode *, struct dentry *); 70extern int cifs_rmdir(struct inode *, struct dentry *);
71extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 71extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
72 struct dentry *); 72 struct dentry *, unsigned int);
73extern int cifs_revalidate_file_attr(struct file *filp); 73extern int cifs_revalidate_file_attr(struct file *filp);
74extern int cifs_revalidate_dentry_attr(struct dentry *); 74extern int cifs_revalidate_dentry_attr(struct dentry *);
75extern int cifs_revalidate_file(struct file *filp); 75extern int cifs_revalidate_file(struct file *filp);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 41de3935caa0..426d6c6ad8bf 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1627,8 +1627,9 @@ do_rename_exit:
1627} 1627}
1628 1628
1629int 1629int
1630cifs_rename(struct inode *source_dir, struct dentry *source_dentry, 1630cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
1631 struct inode *target_dir, struct dentry *target_dentry) 1631 struct inode *target_dir, struct dentry *target_dentry,
1632 unsigned int flags)
1632{ 1633{
1633 char *from_name = NULL; 1634 char *from_name = NULL;
1634 char *to_name = NULL; 1635 char *to_name = NULL;
@@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1640 unsigned int xid; 1641 unsigned int xid;
1641 int rc, tmprc; 1642 int rc, tmprc;
1642 1643
1644 if (flags & ~RENAME_NOREPLACE)
1645 return -EINVAL;
1646
1643 cifs_sb = CIFS_SB(source_dir->i_sb); 1647 cifs_sb = CIFS_SB(source_dir->i_sb);
1644 tlink = cifs_sb_tlink(cifs_sb); 1648 tlink = cifs_sb_tlink(cifs_sb);
1645 if (IS_ERR(tlink)) 1649 if (IS_ERR(tlink))
@@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1667 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, 1671 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
1668 to_name); 1672 to_name);
1669 1673
1674 /*
1675 * No-replace is the natural behavior for CIFS, so skip unlink hacks.
1676 */
1677 if (flags & RENAME_NOREPLACE)
1678 goto cifs_rename_exit;
1679
1670 if (rc == -EEXIST && tcon->unix_ext) { 1680 if (rc == -EEXIST && tcon->unix_ext) {
1671 /* 1681 /*
1672 * Are src and dst hardlinks of same inode? We can only tell 1682 * Are src and dst hardlinks of same inode? We can only tell
diff --git a/fs/dcache.c b/fs/dcache.c
index 06f65857a855..d30ce699ae4b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent);
731/** 731/**
732 * d_find_alias - grab a hashed alias of inode 732 * d_find_alias - grab a hashed alias of inode
733 * @inode: inode in question 733 * @inode: inode in question
734 * @want_discon: flag, used by d_splice_alias, to request
735 * that only a DISCONNECTED alias be returned.
736 * 734 *
737 * If inode has a hashed alias, or is a directory and has any alias, 735 * If inode has a hashed alias, or is a directory and has any alias,
738 * acquire the reference to alias and return it. Otherwise return NULL. 736 * acquire the reference to alias and return it. Otherwise return NULL.
@@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent);
741 * of a filesystem. 739 * of a filesystem.
742 * 740 *
743 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer 741 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
744 * any other hashed alias over that one unless @want_discon is set, 742 * any other hashed alias over that one.
745 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
746 */ 743 */
747static struct dentry *__d_find_alias(struct inode *inode, int want_discon) 744static struct dentry *__d_find_alias(struct inode *inode)
748{ 745{
749 struct dentry *alias, *discon_alias; 746 struct dentry *alias, *discon_alias;
750 747
@@ -756,7 +753,7 @@ again:
756 if (IS_ROOT(alias) && 753 if (IS_ROOT(alias) &&
757 (alias->d_flags & DCACHE_DISCONNECTED)) { 754 (alias->d_flags & DCACHE_DISCONNECTED)) {
758 discon_alias = alias; 755 discon_alias = alias;
759 } else if (!want_discon) { 756 } else {
760 __dget_dlock(alias); 757 __dget_dlock(alias);
761 spin_unlock(&alias->d_lock); 758 spin_unlock(&alias->d_lock);
762 return alias; 759 return alias;
@@ -768,12 +765,9 @@ again:
768 alias = discon_alias; 765 alias = discon_alias;
769 spin_lock(&alias->d_lock); 766 spin_lock(&alias->d_lock);
770 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 767 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
771 if (IS_ROOT(alias) && 768 __dget_dlock(alias);
772 (alias->d_flags & DCACHE_DISCONNECTED)) { 769 spin_unlock(&alias->d_lock);
773 __dget_dlock(alias); 770 return alias;
774 spin_unlock(&alias->d_lock);
775 return alias;
776 }
777 } 771 }
778 spin_unlock(&alias->d_lock); 772 spin_unlock(&alias->d_lock);
779 goto again; 773 goto again;
@@ -787,7 +781,7 @@ struct dentry *d_find_alias(struct inode *inode)
787 781
788 if (!hlist_empty(&inode->i_dentry)) { 782 if (!hlist_empty(&inode->i_dentry)) {
789 spin_lock(&inode->i_lock); 783 spin_lock(&inode->i_lock);
790 de = __d_find_alias(inode, 0); 784 de = __d_find_alias(inode);
791 spin_unlock(&inode->i_lock); 785 spin_unlock(&inode->i_lock);
792 } 786 }
793 return de; 787 return de;
@@ -1781,25 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
1781} 1775}
1782EXPORT_SYMBOL(d_find_any_alias); 1776EXPORT_SYMBOL(d_find_any_alias);
1783 1777
1784/** 1778static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
1785 * d_obtain_alias - find or allocate a dentry for a given inode
1786 * @inode: inode to allocate the dentry for
1787 *
1788 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
1789 * similar open by handle operations. The returned dentry may be anonymous,
1790 * or may have a full name (if the inode was already in the cache).
1791 *
1792 * When called on a directory inode, we must ensure that the inode only ever
1793 * has one dentry. If a dentry is found, that is returned instead of
1794 * allocating a new one.
1795 *
1796 * On successful return, the reference to the inode has been transferred
1797 * to the dentry. In case of an error the reference on the inode is released.
1798 * To make it easier to use in export operations a %NULL or IS_ERR inode may
1799 * be passed in and will be the error will be propagate to the return value,
1800 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
1801 */
1802struct dentry *d_obtain_alias(struct inode *inode)
1803{ 1779{
1804 static const struct qstr anonstring = QSTR_INIT("/", 1); 1780 static const struct qstr anonstring = QSTR_INIT("/", 1);
1805 struct dentry *tmp; 1781 struct dentry *tmp;
@@ -1830,7 +1806,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
1830 } 1806 }
1831 1807
1832 /* attach a disconnected dentry */ 1808 /* attach a disconnected dentry */
1833 add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; 1809 add_flags = d_flags_for_inode(inode);
1810
1811 if (disconnected)
1812 add_flags |= DCACHE_DISCONNECTED;
1834 1813
1835 spin_lock(&tmp->d_lock); 1814 spin_lock(&tmp->d_lock);
1836 tmp->d_inode = inode; 1815 tmp->d_inode = inode;
@@ -1851,59 +1830,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
1851 iput(inode); 1830 iput(inode);
1852 return res; 1831 return res;
1853} 1832}
1854EXPORT_SYMBOL(d_obtain_alias);
1855 1833
1856/** 1834/**
1857 * d_splice_alias - splice a disconnected dentry into the tree if one exists 1835 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
1858 * @inode: the inode which may have a disconnected dentry 1836 * @inode: inode to allocate the dentry for
1859 * @dentry: a negative dentry which we want to point to the inode.
1860 *
1861 * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
1862 * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
1863 * and return it, else simply d_add the inode to the dentry and return NULL.
1864 * 1837 *
1865 * This is needed in the lookup routine of any filesystem that is exportable 1838 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
1866 * (via knfsd) so that we can build dcache paths to directories effectively. 1839 * similar open by handle operations. The returned dentry may be anonymous,
1840 * or may have a full name (if the inode was already in the cache).
1867 * 1841 *
1868 * If a dentry was found and moved, then it is returned. Otherwise NULL 1842 * When called on a directory inode, we must ensure that the inode only ever
1869 * is returned. This matches the expected return value of ->lookup. 1843 * has one dentry. If a dentry is found, that is returned instead of
1844 * allocating a new one.
1870 * 1845 *
1871 * Cluster filesystems may call this function with a negative, hashed dentry. 1846 * On successful return, the reference to the inode has been transferred
1872 * In that case, we know that the inode will be a regular file, and also this 1847 * to the dentry. In case of an error the reference on the inode is released.
1873 * will only occur during atomic_open. So we need to check for the dentry 1848 * To make it easier to use in export operations a %NULL or IS_ERR inode may
1874 * being already hashed only in the final case. 1849 * be passed in and the error will be propagated to the return value,
1850 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
1875 */ 1851 */
1876struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) 1852struct dentry *d_obtain_alias(struct inode *inode)
1877{ 1853{
1878 struct dentry *new = NULL; 1854 return __d_obtain_alias(inode, 1);
1879 1855}
1880 if (IS_ERR(inode)) 1856EXPORT_SYMBOL(d_obtain_alias);
1881 return ERR_CAST(inode);
1882 1857
1883 if (inode && S_ISDIR(inode->i_mode)) { 1858/**
1884 spin_lock(&inode->i_lock); 1859 * d_obtain_root - find or allocate a dentry for a given inode
1885 new = __d_find_alias(inode, 1); 1860 * @inode: inode to allocate the dentry for
1886 if (new) { 1861 *
1887 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1862 * Obtain an IS_ROOT dentry for the root of a filesystem.
1888 spin_unlock(&inode->i_lock); 1863 *
1889 security_d_instantiate(new, inode); 1864 * We must ensure that directory inodes only ever have one dentry. If a
1890 d_move(new, dentry); 1865 * dentry is found, that is returned instead of allocating a new one.
1891 iput(inode); 1866 *
1892 } else { 1867 * On successful return, the reference to the inode has been transferred
1893 /* already taking inode->i_lock, so d_add() by hand */ 1868 * to the dentry. In case of an error the reference on the inode is
1894 __d_instantiate(dentry, inode); 1869 * released. A %NULL or IS_ERR inode may be passed in and will be the
1895 spin_unlock(&inode->i_lock); 1870 * error will be propagate to the return value, with a %NULL @inode
1896 security_d_instantiate(dentry, inode); 1871 * replaced by ERR_PTR(-ESTALE).
1897 d_rehash(dentry); 1872 */
1898 } 1873struct dentry *d_obtain_root(struct inode *inode)
1899 } else { 1874{
1900 d_instantiate(dentry, inode); 1875 return __d_obtain_alias(inode, 0);
1901 if (d_unhashed(dentry))
1902 d_rehash(dentry);
1903 }
1904 return new;
1905} 1876}
1906EXPORT_SYMBOL(d_splice_alias); 1877EXPORT_SYMBOL(d_obtain_root);
1907 1878
1908/** 1879/**
1909 * d_add_ci - lookup or allocate new dentry with case-exact name 1880 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -2697,6 +2668,75 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2697} 2668}
2698 2669
2699/** 2670/**
2671 * d_splice_alias - splice a disconnected dentry into the tree if one exists
2672 * @inode: the inode which may have a disconnected dentry
2673 * @dentry: a negative dentry which we want to point to the inode.
2674 *
2675 * If inode is a directory and has an IS_ROOT alias, then d_move that in
2676 * place of the given dentry and return it, else simply d_add the inode
2677 * to the dentry and return NULL.
2678 *
2679 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
2680 * we should error out: directories can't have multiple aliases.
2681 *
2682 * This is needed in the lookup routine of any filesystem that is exportable
2683 * (via knfsd) so that we can build dcache paths to directories effectively.
2684 *
2685 * If a dentry was found and moved, then it is returned. Otherwise NULL
2686 * is returned. This matches the expected return value of ->lookup.
2687 *
2688 * Cluster filesystems may call this function with a negative, hashed dentry.
2689 * In that case, we know that the inode will be a regular file, and also this
2690 * will only occur during atomic_open. So we need to check for the dentry
2691 * being already hashed only in the final case.
2692 */
2693struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2694{
2695 struct dentry *new = NULL;
2696
2697 if (IS_ERR(inode))
2698 return ERR_CAST(inode);
2699
2700 if (inode && S_ISDIR(inode->i_mode)) {
2701 spin_lock(&inode->i_lock);
2702 new = __d_find_any_alias(inode);
2703 if (new) {
2704 if (!IS_ROOT(new)) {
2705 spin_unlock(&inode->i_lock);
2706 dput(new);
2707 return ERR_PTR(-EIO);
2708 }
2709 if (d_ancestor(new, dentry)) {
2710 spin_unlock(&inode->i_lock);
2711 dput(new);
2712 return ERR_PTR(-EIO);
2713 }
2714 write_seqlock(&rename_lock);
2715 __d_materialise_dentry(dentry, new);
2716 write_sequnlock(&rename_lock);
2717 __d_drop(new);
2718 _d_rehash(new);
2719 spin_unlock(&new->d_lock);
2720 spin_unlock(&inode->i_lock);
2721 security_d_instantiate(new, inode);
2722 iput(inode);
2723 } else {
2724 /* already taking inode->i_lock, so d_add() by hand */
2725 __d_instantiate(dentry, inode);
2726 spin_unlock(&inode->i_lock);
2727 security_d_instantiate(dentry, inode);
2728 d_rehash(dentry);
2729 }
2730 } else {
2731 d_instantiate(dentry, inode);
2732 if (d_unhashed(dentry))
2733 d_rehash(dentry);
2734 }
2735 return new;
2736}
2737EXPORT_SYMBOL(d_splice_alias);
2738
2739/**
2700 * d_materialise_unique - introduce an inode into the tree 2740 * d_materialise_unique - introduce an inode into the tree
2701 * @dentry: candidate dentry 2741 * @dentry: candidate dentry
2702 * @inode: inode to bind to the dentry, to which aliases may be attached 2742 * @inode: inode to bind to the dentry, to which aliases may be attached
@@ -2724,7 +2764,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2724 struct dentry *alias; 2764 struct dentry *alias;
2725 2765
2726 /* Does an aliased dentry already exist? */ 2766 /* Does an aliased dentry already exist? */
2727 alias = __d_find_alias(inode, 0); 2767 alias = __d_find_alias(inode);
2728 if (alias) { 2768 if (alias) {
2729 actual = alias; 2769 actual = alias;
2730 write_seqlock(&rename_lock); 2770 write_seqlock(&rename_lock);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 17e39b047de5..c3116404ab49 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
158{ 158{
159 ssize_t ret; 159 ssize_t ret;
160 160
161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, 161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
162 &sdio->from); 162 &sdio->from);
163 163
164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3520ab8a6639..b147a67baa0d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = {
3455 .rmdir = ext4_rmdir, 3455 .rmdir = ext4_rmdir,
3456 .mknod = ext4_mknod, 3456 .mknod = ext4_mknod,
3457 .tmpfile = ext4_tmpfile, 3457 .tmpfile = ext4_tmpfile,
3458 .rename = ext4_rename,
3459 .rename2 = ext4_rename2, 3458 .rename2 = ext4_rename2,
3460 .setattr = ext4_setattr, 3459 .setattr = ext4_setattr,
3461 .setxattr = generic_setxattr, 3460 .setxattr = generic_setxattr,
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644
index 000000000000..9368236ca100
--- /dev/null
+++ b/fs/fs_pin.c
@@ -0,0 +1,78 @@
1#include <linux/fs.h>
2#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h"
5#include "mount.h"
6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock);
13
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin)
21{
22 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock);
26}
27
28void pin_insert(struct fs_pin *pin, struct vfsmount *m)
29{
30 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock);
34}
35
36void mnt_pin_kill(struct mount *m)
37{
38 while (1) {
39 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) {
44 rcu_read_unlock();
45 break;
46 }
47 pin = hlist_entry(p, struct fs_pin, m_list);
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 }
56}
57
58void sb_pin_kill(struct super_block *sb)
59{
60 while (1) {
61 struct hlist_node *p;
62 struct fs_pin *pin;
63 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first);
65 if (!p) {
66 rcu_read_unlock();
67 break;
68 }
69 pin = hlist_entry(p, struct fs_pin, s_list);
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 }
78}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0c6048247a34..de1d84af9f7c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
845 return err; 845 return err;
846} 846}
847 847
848static int fuse_rename(struct inode *olddir, struct dentry *oldent,
849 struct inode *newdir, struct dentry *newent)
850{
851 return fuse_rename2(olddir, oldent, newdir, newent, 0);
852}
853
854static int fuse_link(struct dentry *entry, struct inode *newdir, 848static int fuse_link(struct dentry *entry, struct inode *newdir,
855 struct dentry *newent) 849 struct dentry *newent)
856{ 850{
@@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
2024 .symlink = fuse_symlink, 2018 .symlink = fuse_symlink,
2025 .unlink = fuse_unlink, 2019 .unlink = fuse_unlink,
2026 .rmdir = fuse_rmdir, 2020 .rmdir = fuse_rmdir,
2027 .rename = fuse_rename,
2028 .rename2 = fuse_rename2, 2021 .rename2 = fuse_rename2,
2029 .link = fuse_link, 2022 .link = fuse_link,
2030 .setattr = fuse_setattr, 2023 .setattr = fuse_setattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40ac2628ddcf..912061ac4baf 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1304 unsigned npages; 1304 unsigned npages;
1305 size_t start; 1305 size_t start;
1306 unsigned n = req->max_pages - req->num_pages;
1307 ssize_t ret = iov_iter_get_pages(ii, 1306 ssize_t ret = iov_iter_get_pages(ii,
1308 &req->pages[req->num_pages], 1307 &req->pages[req->num_pages],
1309 n * PAGE_SIZE, &start); 1308 req->max_pages - req->num_pages,
1309 &start);
1310 if (ret < 0) 1310 if (ret < 0)
1311 return ret; 1311 return ret;
1312 1312
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 9c88da0e855a..4fcd40d6f308 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
89extern int link_file(const char *from, const char *to); 89extern int link_file(const char *from, const char *to);
90extern int hostfs_do_readlink(char *file, char *buf, int size); 90extern int hostfs_do_readlink(char *file, char *buf, int size);
91extern int rename_file(char *from, char *to); 91extern int rename_file(char *from, char *to);
92extern int rename2_file(char *from, char *to, unsigned int flags);
92extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 93extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
93 long long *bfree_out, long long *bavail_out, 94 long long *bfree_out, long long *bavail_out,
94 long long *files_out, long long *ffree_out, 95 long long *files_out, long long *ffree_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bb529f3b7f2b..fd62cae0fdcb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
741 return err; 741 return err;
742} 742}
743 743
744static int hostfs_rename(struct inode *from_ino, struct dentry *from, 744static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
745 struct inode *to_ino, struct dentry *to) 745 struct inode *new_dir, struct dentry *new_dentry,
746 unsigned int flags)
746{ 747{
747 char *from_name, *to_name; 748 char *old_name, *new_name;
748 int err; 749 int err;
749 750
750 if ((from_name = dentry_name(from)) == NULL) 751 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
752 return -EINVAL;
753
754 old_name = dentry_name(old_dentry);
755 if (old_name == NULL)
751 return -ENOMEM; 756 return -ENOMEM;
752 if ((to_name = dentry_name(to)) == NULL) { 757 new_name = dentry_name(new_dentry);
753 __putname(from_name); 758 if (new_name == NULL) {
759 __putname(old_name);
754 return -ENOMEM; 760 return -ENOMEM;
755 } 761 }
756 err = rename_file(from_name, to_name); 762 if (!flags)
757 __putname(from_name); 763 err = rename_file(old_name, new_name);
758 __putname(to_name); 764 else
765 err = rename2_file(old_name, new_name, flags);
766
767 __putname(old_name);
768 __putname(new_name);
759 return err; 769 return err;
760} 770}
761 771
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
867 .mkdir = hostfs_mkdir, 877 .mkdir = hostfs_mkdir,
868 .rmdir = hostfs_rmdir, 878 .rmdir = hostfs_rmdir,
869 .mknod = hostfs_mknod, 879 .mknod = hostfs_mknod,
870 .rename = hostfs_rename, 880 .rename2 = hostfs_rename2,
871 .permission = hostfs_permission, 881 .permission = hostfs_permission,
872 .setattr = hostfs_setattr, 882 .setattr = hostfs_setattr,
873}; 883};
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 67838f3aa20a..9765dab95cbd 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -14,6 +14,7 @@
14#include <sys/time.h> 14#include <sys/time.h>
15#include <sys/types.h> 15#include <sys/types.h>
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include <sys/syscall.h>
17#include "hostfs.h" 18#include "hostfs.h"
18#include <utime.h> 19#include <utime.h>
19 20
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
360 return 0; 361 return 0;
361} 362}
362 363
364int rename2_file(char *from, char *to, unsigned int flags)
365{
366 int err;
367
368#ifndef SYS_renameat2
369# ifdef __x86_64__
370# define SYS_renameat2 316
371# endif
372# ifdef __i386__
373# define SYS_renameat2 353
374# endif
375#endif
376
377#ifdef SYS_renameat2
378 err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
379 if (err < 0) {
380 if (errno != ENOSYS)
381 return -errno;
382 else
383 return -EINVAL;
384 }
385 return 0;
386#else
387 return -EINVAL;
388#endif
389}
390
363int do_statfs(char *root, long *bsize_out, long long *blocks_out, 391int do_statfs(char *root, long *bsize_out, long long *blocks_out,
364 long long *bfree_out, long long *bavail_out, 392 long long *bfree_out, long long *bavail_out,
365 long long *files_out, long long *ffree_out, 393 long long *files_out, long long *ffree_out,
diff --git a/fs/internal.h b/fs/internal.h
index 465742407466..e325b4f9c799 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
131/* 131/*
132 * read_write.c 132 * read_write.c
133 */ 133 */
134extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
135extern int rw_verify_area(int, struct file *, const loff_t *, size_t); 134extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
136 135
137/* 136/*
@@ -144,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
144 * pipe.c 143 * pipe.c
145 */ 144 */
146extern const struct file_operations pipefifo_fops; 145extern const struct file_operations pipefifo_fops;
146
147/*
148 * fs_pin.c
149 */
150extern void sb_pin_kill(struct super_block *sb);
151extern void mnt_pin_kill(struct mount *m);
diff --git a/fs/mount.h b/fs/mount.h
index d55297f2fa05..6740a6215529 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -55,7 +55,7 @@ struct mount {
55 int mnt_id; /* mount identifier */ 55 int mnt_id; /* mount identifier */
56 int mnt_group_id; /* peer group identifier */ 56 int mnt_group_id; /* peer group identifier */
57 int mnt_expiry_mark; /* true if marked for expiry */ 57 int mnt_expiry_mark; /* true if marked for expiry */
58 int mnt_pinned; 58 struct hlist_head mnt_pins;
59 struct path mnt_ex_mountpoint; 59 struct path mnt_ex_mountpoint;
60}; 60};
61 61
diff --git a/fs/namei.c b/fs/namei.c
index 9eb787e5c167..a996bb48dfab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
1091} 1091}
1092EXPORT_SYMBOL(follow_down_one); 1092EXPORT_SYMBOL(follow_down_one);
1093 1093
1094static inline bool managed_dentry_might_block(struct dentry *dentry) 1094static inline int managed_dentry_rcu(struct dentry *dentry)
1095{ 1095{
1096 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && 1096 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1097 dentry->d_op->d_manage(dentry, true) < 0); 1097 dentry->d_op->d_manage(dentry, true) : 0;
1098} 1098}
1099 1099
1100/* 1100/*
@@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1110 * Don't forget we might have a non-mountpoint managed dentry 1110 * Don't forget we might have a non-mountpoint managed dentry
1111 * that wants to block transit. 1111 * that wants to block transit.
1112 */ 1112 */
1113 if (unlikely(managed_dentry_might_block(path->dentry))) 1113 switch (managed_dentry_rcu(path->dentry)) {
1114 case -ECHILD:
1115 default:
1114 return false; 1116 return false;
1117 case -EISDIR:
1118 return true;
1119 case 0:
1120 break;
1121 }
1115 1122
1116 if (!d_mountpoint(path->dentry)) 1123 if (!d_mountpoint(path->dentry))
1117 return true; 1124 return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1118 1125
1119 mounted = __lookup_mnt(path->mnt, path->dentry); 1126 mounted = __lookup_mnt(path->mnt, path->dentry);
1120 if (!mounted) 1127 if (!mounted)
@@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1130 */ 1137 */
1131 *inode = path->dentry->d_inode; 1138 *inode = path->dentry->d_inode;
1132 } 1139 }
1133 return read_seqretry(&mount_lock, nd->m_seq); 1140 return read_seqretry(&mount_lock, nd->m_seq) &&
1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1134} 1142}
1135 1143
1136static int follow_dotdot_rcu(struct nameidata *nd) 1144static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd,
1402 } 1410 }
1403 path->mnt = mnt; 1411 path->mnt = mnt;
1404 path->dentry = dentry; 1412 path->dentry = dentry;
1405 if (unlikely(!__follow_mount_rcu(nd, path, inode))) 1413 if (likely(__follow_mount_rcu(nd, path, inode)))
1406 goto unlazy; 1414 return 0;
1407 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1408 goto unlazy;
1409 return 0;
1410unlazy: 1415unlazy:
1411 if (unlazy_walk(nd, dentry)) 1416 if (unlazy_walk(nd, dentry))
1412 return -ECHILD; 1417 return -ECHILD;
@@ -4019,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
4019 * The worst of all namespace operations - renaming directory. "Perverted" 4024 * The worst of all namespace operations - renaming directory. "Perverted"
4020 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 4025 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4021 * Problems: 4026 * Problems:
4022 * a) we can get into loop creation. Check is done in is_subdir(). 4027 * a) we can get into loop creation.
4023 * b) race potential - two innocent renames can create a loop together. 4028 * b) race potential - two innocent renames can create a loop together.
4024 * That's where 4.4 screws up. Current fix: serialization on 4029 * That's where 4.4 screws up. Current fix: serialization on
4025 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 4030 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@@ -4075,7 +4080,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4075 if (error) 4080 if (error)
4076 return error; 4081 return error;
4077 4082
4078 if (!old_dir->i_op->rename) 4083 if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
4079 return -EPERM; 4084 return -EPERM;
4080 4085
4081 if (flags && !old_dir->i_op->rename2) 4086 if (flags && !old_dir->i_op->rename2)
@@ -4134,10 +4139,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4134 if (error) 4139 if (error)
4135 goto out; 4140 goto out;
4136 } 4141 }
4137 if (!flags) { 4142 if (!old_dir->i_op->rename2) {
4138 error = old_dir->i_op->rename(old_dir, old_dentry, 4143 error = old_dir->i_op->rename(old_dir, old_dentry,
4139 new_dir, new_dentry); 4144 new_dir, new_dentry);
4140 } else { 4145 } else {
4146 WARN_ON(old_dir->i_op->rename != NULL);
4141 error = old_dir->i_op->rename2(old_dir, old_dentry, 4147 error = old_dir->i_op->rename2(old_dir, old_dentry,
4142 new_dir, new_dentry, flags); 4148 new_dir, new_dentry, flags);
4143 } 4149 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 0acabea58319..a01c7730e9af 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,7 +16,6 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/security.h> 17#include <linux/security.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/acct.h> /* acct_auto_close_mnt */
20#include <linux/init.h> /* init_rootfs */ 19#include <linux/init.h> /* init_rootfs */
21#include <linux/fs_struct.h> /* get_fs_root et.al. */ 20#include <linux/fs_struct.h> /* get_fs_root et.al. */
22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
@@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt,
779 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 778 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
780} 779}
781 780
781static void attach_shadowed(struct mount *mnt,
782 struct mount *parent,
783 struct mount *shadows)
784{
785 if (shadows) {
786 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
787 list_add(&mnt->mnt_child, &shadows->mnt_child);
788 } else {
789 hlist_add_head_rcu(&mnt->mnt_hash,
790 m_hash(&parent->mnt, mnt->mnt_mountpoint));
791 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
792 }
793}
794
782/* 795/*
783 * vfsmount lock must be held for write 796 * vfsmount lock must be held for write
784 */ 797 */
@@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
797 810
798 list_splice(&head, n->list.prev); 811 list_splice(&head, n->list.prev);
799 812
800 if (shadows) 813 attach_shadowed(mnt, parent, shadows);
801 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
802 else
803 hlist_add_head_rcu(&mnt->mnt_hash,
804 m_hash(&parent->mnt, mnt->mnt_mountpoint));
805 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
806 touch_mnt_namespace(n); 814 touch_mnt_namespace(n);
807} 815}
808 816
@@ -951,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
951 959
952static void mntput_no_expire(struct mount *mnt) 960static void mntput_no_expire(struct mount *mnt)
953{ 961{
954put_again:
955 rcu_read_lock(); 962 rcu_read_lock();
956 mnt_add_count(mnt, -1); 963 mnt_add_count(mnt, -1);
957 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ 964 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -964,14 +971,6 @@ put_again:
964 unlock_mount_hash(); 971 unlock_mount_hash();
965 return; 972 return;
966 } 973 }
967 if (unlikely(mnt->mnt_pinned)) {
968 mnt_add_count(mnt, mnt->mnt_pinned + 1);
969 mnt->mnt_pinned = 0;
970 rcu_read_unlock();
971 unlock_mount_hash();
972 acct_auto_close_mnt(&mnt->mnt);
973 goto put_again;
974 }
975 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 974 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
976 rcu_read_unlock(); 975 rcu_read_unlock();
977 unlock_mount_hash(); 976 unlock_mount_hash();
@@ -994,6 +993,8 @@ put_again:
994 * so mnt_get_writers() below is safe. 993 * so mnt_get_writers() below is safe.
995 */ 994 */
996 WARN_ON(mnt_get_writers(mnt)); 995 WARN_ON(mnt_get_writers(mnt));
996 if (unlikely(mnt->mnt_pins.first))
997 mnt_pin_kill(mnt);
997 fsnotify_vfsmount_delete(&mnt->mnt); 998 fsnotify_vfsmount_delete(&mnt->mnt);
998 dput(mnt->mnt.mnt_root); 999 dput(mnt->mnt.mnt_root);
999 deactivate_super(mnt->mnt.mnt_sb); 1000 deactivate_super(mnt->mnt.mnt_sb);
@@ -1021,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
1021} 1022}
1022EXPORT_SYMBOL(mntget); 1023EXPORT_SYMBOL(mntget);
1023 1024
1024void mnt_pin(struct vfsmount *mnt) 1025struct vfsmount *mnt_clone_internal(struct path *path)
1025{
1026 lock_mount_hash();
1027 real_mount(mnt)->mnt_pinned++;
1028 unlock_mount_hash();
1029}
1030EXPORT_SYMBOL(mnt_pin);
1031
1032void mnt_unpin(struct vfsmount *m)
1033{ 1026{
1034 struct mount *mnt = real_mount(m); 1027 struct mount *p;
1035 lock_mount_hash(); 1028 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1036 if (mnt->mnt_pinned) { 1029 if (IS_ERR(p))
1037 mnt_add_count(mnt, 1); 1030 return ERR_CAST(p);
1038 mnt->mnt_pinned--; 1031 p->mnt.mnt_flags |= MNT_INTERNAL;
1039 } 1032 return &p->mnt;
1040 unlock_mount_hash();
1041} 1033}
1042EXPORT_SYMBOL(mnt_unpin);
1043 1034
1044static inline void mangle(struct seq_file *m, const char *s) 1035static inline void mangle(struct seq_file *m, const char *s)
1045{ 1036{
@@ -1505,6 +1496,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1505 continue; 1496 continue;
1506 1497
1507 for (s = r; s; s = next_mnt(s, r)) { 1498 for (s = r; s; s = next_mnt(s, r)) {
1499 struct mount *t = NULL;
1508 if (!(flag & CL_COPY_UNBINDABLE) && 1500 if (!(flag & CL_COPY_UNBINDABLE) &&
1509 IS_MNT_UNBINDABLE(s)) { 1501 IS_MNT_UNBINDABLE(s)) {
1510 s = skip_mnt_tree(s); 1502 s = skip_mnt_tree(s);
@@ -1526,7 +1518,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1526 goto out; 1518 goto out;
1527 lock_mount_hash(); 1519 lock_mount_hash();
1528 list_add_tail(&q->mnt_list, &res->mnt_list); 1520 list_add_tail(&q->mnt_list, &res->mnt_list);
1529 attach_mnt(q, parent, p->mnt_mp); 1521 mnt_set_mountpoint(parent, p->mnt_mp, q);
1522 if (!list_empty(&parent->mnt_mounts)) {
1523 t = list_last_entry(&parent->mnt_mounts,
1524 struct mount, mnt_child);
1525 if (t->mnt_mp != p->mnt_mp)
1526 t = NULL;
1527 }
1528 attach_shadowed(q, parent, t);
1530 unlock_mount_hash(); 1529 unlock_mount_hash();
1531 } 1530 }
1532 } 1531 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f80420a58..880618a8b048 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
112 * if the dentry tree reaches them; however if the dentry already 112 * if the dentry tree reaches them; however if the dentry already
113 * exists, we'll pick it up at this point and use it as the root 113 * exists, we'll pick it up at this point and use it as the root
114 */ 114 */
115 ret = d_obtain_alias(inode); 115 ret = d_obtain_root(inode);
116 if (IS_ERR(ret)) { 116 if (IS_ERR(ret)) {
117 dprintk("nfs_get_root: get root dentry failed\n"); 117 dprintk("nfs_get_root: get root dentry failed\n");
118 goto out; 118 goto out;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c519927b7b5e..228f5bdf0772 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
942 iput(inode); 942 iput(inode);
943 } 943 }
944 } else { 944 } else {
945 dentry = d_obtain_alias(inode); 945 dentry = d_obtain_root(inode);
946 if (IS_ERR(dentry)) { 946 if (IS_ERR(dentry)) {
947 ret = PTR_ERR(dentry); 947 ret = PTR_ERR(dentry);
948 goto failed_dentry; 948 goto failed_dentry;
diff --git a/fs/super.c b/fs/super.c
index d20d5b11dedf..a371ce6aa919 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/export.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/mount.h> 26#include <linux/mount.h>
28#include <linux/security.h> 27#include <linux/security.h>
@@ -702,12 +701,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
702 return -EACCES; 701 return -EACCES;
703#endif 702#endif
704 703
705 if (flags & MS_RDONLY)
706 acct_auto_close(sb);
707 shrink_dcache_sb(sb);
708
709 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 704 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
710 705
706 if (remount_ro) {
707 if (sb->s_pins.first) {
708 up_write(&sb->s_umount);
709 sb_pin_kill(sb);
710 down_write(&sb->s_umount);
711 if (!sb->s_root)
712 return 0;
713 if (sb->s_writers.frozen != SB_UNFROZEN)
714 return -EBUSY;
715 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
716 }
717 }
718 shrink_dcache_sb(sb);
719
711 /* If we are remounting RDONLY and current sb is read/write, 720 /* If we are remounting RDONLY and current sb is read/write,
712 make sure there are no rw files opened */ 721 make sure there are no rw files opened */
713 if (remount_ro) { 722 if (remount_ro) {
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 4a5b7cb56079..dccc2d4fe7de 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -24,14 +24,10 @@ struct super_block;
24struct pacct_struct; 24struct pacct_struct;
25struct pid_namespace; 25struct pid_namespace;
26extern int acct_parm[]; /* for sysctl */ 26extern int acct_parm[]; /* for sysctl */
27extern void acct_auto_close_mnt(struct vfsmount *m);
28extern void acct_auto_close(struct super_block *sb);
29extern void acct_collect(long exitcode, int group_dead); 27extern void acct_collect(long exitcode, int group_dead);
30extern void acct_process(void); 28extern void acct_process(void);
31extern void acct_exit_ns(struct pid_namespace *); 29extern void acct_exit_ns(struct pid_namespace *);
32#else 30#else
33#define acct_auto_close_mnt(x) do { } while (0)
34#define acct_auto_close(x) do { } while (0)
35#define acct_collect(x,y) do { } while (0) 31#define acct_collect(x,y) do { } while (0)
36#define acct_process() do { } while (0) 32#define acct_process() do { } while (0)
37#define acct_exit_ns(ns) do { } while (0) 33#define acct_exit_ns(ns) do { } while (0)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3c7ec327ebd2..e4ae2ad48d07 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -249,6 +249,7 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
249extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); 249extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
250extern struct dentry *d_find_any_alias(struct inode *inode); 250extern struct dentry *d_find_any_alias(struct inode *inode);
251extern struct dentry * d_obtain_alias(struct inode *); 251extern struct dentry * d_obtain_alias(struct inode *);
252extern struct dentry * d_obtain_root(struct inode *);
252extern void shrink_dcache_sb(struct super_block *); 253extern void shrink_dcache_sb(struct super_block *);
253extern void shrink_dcache_parent(struct dentry *); 254extern void shrink_dcache_parent(struct dentry *);
254extern void shrink_dcache_for_umount(struct super_block *); 255extern void shrink_dcache_for_umount(struct super_block *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f0890e4a7c25..94187721ad41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1275,6 +1275,7 @@ struct super_block {
1275 1275
1276 /* AIO completions deferred from interrupt context */ 1276 /* AIO completions deferred from interrupt context */
1277 struct workqueue_struct *s_dio_done_wq; 1277 struct workqueue_struct *s_dio_done_wq;
1278 struct hlist_head s_pins;
1278 1279
1279 /* 1280 /*
1280 * Keep the lru lists last in the structure so they always sit on their 1281 * Keep the lru lists last in the structure so they always sit on their
@@ -2360,6 +2361,7 @@ extern int do_pipe_flags(int *, int);
2360 2361
2361extern int kernel_read(struct file *, loff_t, char *, unsigned long); 2362extern int kernel_read(struct file *, loff_t, char *, unsigned long);
2362extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t); 2363extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
2364extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
2363extern struct file * open_exec(const char *); 2365extern struct file * open_exec(const char *);
2364 2366
2365/* fs/dcache.c -- generic fs support functions */ 2367/* fs/dcache.c -- generic fs support functions */
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
new file mode 100644
index 000000000000..f66525e72ccf
--- /dev/null
+++ b/include/linux/fs_pin.h
@@ -0,0 +1,17 @@
1#include <linux/fs.h>
2
3struct fs_pin {
4 atomic_long_t count;
5 union {
6 struct {
7 struct hlist_node s_list;
8 struct hlist_node m_list;
9 };
10 struct rcu_head rcu;
11 };
12 void (*kill)(struct fs_pin *);
13};
14
15void pin_put(struct fs_pin *);
16void pin_remove(struct fs_pin *);
17void pin_insert(struct fs_pin *, struct vfsmount *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b0c1e6574e7f..9262e4bf0cc3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -69,6 +69,7 @@ struct vfsmount {
69}; 69};
70 70
71struct file; /* forward dec */ 71struct file; /* forward dec */
72struct path;
72 73
73extern int mnt_want_write(struct vfsmount *mnt); 74extern int mnt_want_write(struct vfsmount *mnt);
74extern int mnt_want_write_file(struct file *file); 75extern int mnt_want_write_file(struct file *file);
@@ -77,8 +78,7 @@ extern void mnt_drop_write(struct vfsmount *mnt);
77extern void mnt_drop_write_file(struct file *file); 78extern void mnt_drop_write_file(struct file *file);
78extern void mntput(struct vfsmount *mnt); 79extern void mntput(struct vfsmount *mnt);
79extern struct vfsmount *mntget(struct vfsmount *mnt); 80extern struct vfsmount *mntget(struct vfsmount *mnt);
80extern void mnt_pin(struct vfsmount *mnt); 81extern struct vfsmount *mnt_clone_internal(struct path *path);
81extern void mnt_unpin(struct vfsmount *mnt);
82extern int __mnt_is_readonly(struct vfsmount *mnt); 82extern int __mnt_is_readonly(struct vfsmount *mnt);
83 83
84struct file_system_type; 84struct file_system_type;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 09a7cffc224e..48d64e6ab292 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -84,7 +84,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i);
84void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, 84void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
85 unsigned long nr_segs, size_t count); 85 unsigned long nr_segs, size_t count);
86ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, 86ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
87 size_t maxsize, size_t *start); 87 unsigned maxpages, size_t *start);
88ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, 88ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
89 size_t maxsize, size_t *start); 89 size_t maxsize, size_t *start);
90int iov_iter_npages(const struct iov_iter *i, int maxpages); 90int iov_iter_npages(const struct iov_iter *i, int maxpages);
diff --git a/kernel/acct.c b/kernel/acct.c
index 51793520566f..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
62#include <linux/fs_pin.h>
62 63
63/* 64/*
64 * These constants control the amount of freespace that suspend and 65 * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
75/* 76/*
76 * External references and all of the globals. 77 * External references and all of the globals.
77 */ 78 */
78static void do_acct_process(struct bsd_acct_struct *acct, 79static void do_acct_process(struct bsd_acct_struct *acct);
79 struct pid_namespace *ns, struct file *);
80 80
81/*
82 * This structure is used so that all the data protected by lock
83 * can be placed in the same cache line as the lock. This primes
84 * the cache line to have the data after getting the lock.
85 */
86struct bsd_acct_struct { 81struct bsd_acct_struct {
82 struct fs_pin pin;
83 struct mutex lock;
87 int active; 84 int active;
88 unsigned long needcheck; 85 unsigned long needcheck;
89 struct file *file; 86 struct file *file;
90 struct pid_namespace *ns; 87 struct pid_namespace *ns;
91 struct list_head list; 88 struct work_struct work;
89 struct completion done;
92}; 90};
93 91
94static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list);
96
97/* 92/*
98 * Check the amount of free space and suspend/resume accordingly. 93 * Check the amount of free space and suspend/resume accordingly.
99 */ 94 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 95static int check_free_space(struct bsd_acct_struct *acct)
101{ 96{
102 struct kstatfs sbuf; 97 struct kstatfs sbuf;
103 int res; 98
104 int act; 99 if (time_is_before_jiffies(acct->needcheck))
105 u64 resume;
106 u64 suspend;
107
108 spin_lock(&acct_lock);
109 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck))
111 goto out; 100 goto out;
112 spin_unlock(&acct_lock);
113 101
114 /* May block */ 102 /* May block */
115 if (vfs_statfs(&file->f_path, &sbuf)) 103 if (vfs_statfs(&acct->file->f_path, &sbuf))
116 return res;
117 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME;
119
120 do_div(suspend, 100);
121 do_div(resume, 100);
122
123 if (sbuf.f_bavail <= suspend)
124 act = -1;
125 else if (sbuf.f_bavail >= resume)
126 act = 1;
127 else
128 act = 0;
129
130 /*
131 * If some joker switched acct->file under us we'ld better be
132 * silent and _not_ touch anything.
133 */
134 spin_lock(&acct_lock);
135 if (file != acct->file) {
136 if (act)
137 res = act > 0;
138 goto out; 104 goto out;
139 }
140 105
141 if (acct->active) { 106 if (acct->active) {
142 if (act < 0) { 107 u64 suspend = sbuf.f_blocks * SUSPEND;
108 do_div(suspend, 100);
109 if (sbuf.f_bavail <= suspend) {
143 acct->active = 0; 110 acct->active = 0;
144 pr_info("Process accounting paused\n"); 111 pr_info("Process accounting paused\n");
145 } 112 }
146 } else { 113 } else {
147 if (act > 0) { 114 u64 resume = sbuf.f_blocks * RESUME;
115 do_div(resume, 100);
116 if (sbuf.f_bavail >= resume) {
148 acct->active = 1; 117 acct->active = 1;
149 pr_info("Process accounting resumed\n"); 118 pr_info("Process accounting resumed\n");
150 } 119 }
151 } 120 }
152 121
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 122 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
154 res = acct->active;
155out: 123out:
156 spin_unlock(&acct_lock); 124 return acct->active;
125}
126
127static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
128{
129 struct bsd_acct_struct *res;
130again:
131 smp_rmb();
132 rcu_read_lock();
133 res = ACCESS_ONCE(ns->bacct);
134 if (!res) {
135 rcu_read_unlock();
136 return NULL;
137 }
138 if (!atomic_long_inc_not_zero(&res->pin.count)) {
139 rcu_read_unlock();
140 cpu_relax();
141 goto again;
142 }
143 rcu_read_unlock();
144 mutex_lock(&res->lock);
145 if (!res->ns) {
146 mutex_unlock(&res->lock);
147 pin_put(&res->pin);
148 goto again;
149 }
157 return res; 150 return res;
158} 151}
159 152
160/* 153static void close_work(struct work_struct *work)
161 * Close the old accounting file (if currently open) and then replace
162 * it with file (if non-NULL).
163 *
164 * NOTE: acct_lock MUST be held on entry and exit.
165 */
166static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
167 struct pid_namespace *ns)
168{ 154{
169 struct file *old_acct = NULL; 155 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
170 struct pid_namespace *old_ns = NULL; 156 struct file *file = acct->file;
171 157 if (file->f_op->flush)
172 if (acct->file) { 158 file->f_op->flush(file, NULL);
173 old_acct = acct->file; 159 __fput_sync(file);
174 old_ns = acct->ns; 160 complete(&acct->done);
175 acct->active = 0; 161}
176 acct->file = NULL; 162
163static void acct_kill(struct bsd_acct_struct *acct,
164 struct bsd_acct_struct *new)
165{
166 if (acct) {
167 struct pid_namespace *ns = acct->ns;
168 do_acct_process(acct);
169 INIT_WORK(&acct->work, close_work);
170 init_completion(&acct->done);
171 schedule_work(&acct->work);
172 wait_for_completion(&acct->done);
173 pin_remove(&acct->pin);
174 ns->bacct = new;
177 acct->ns = NULL; 175 acct->ns = NULL;
178 list_del(&acct->list); 176 atomic_long_dec(&acct->pin.count);
179 } 177 mutex_unlock(&acct->lock);
180 if (file) { 178 pin_put(&acct->pin);
181 acct->file = file;
182 acct->ns = ns;
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
184 acct->active = 1;
185 list_add(&acct->list, &acct_list);
186 } 179 }
187 if (old_acct) { 180}
188 mnt_unpin(old_acct->f_path.mnt); 181
189 spin_unlock(&acct_lock); 182static void acct_pin_kill(struct fs_pin *pin)
190 do_acct_process(acct, old_ns, old_acct); 183{
191 filp_close(old_acct, NULL); 184 struct bsd_acct_struct *acct;
192 spin_lock(&acct_lock); 185 acct = container_of(pin, struct bsd_acct_struct, pin);
186 mutex_lock(&acct->lock);
187 if (!acct->ns) {
188 mutex_unlock(&acct->lock);
189 pin_put(pin);
190 acct = NULL;
193 } 191 }
192 acct_kill(acct, NULL);
194} 193}
195 194
196static int acct_on(struct filename *pathname) 195static int acct_on(struct filename *pathname)
197{ 196{
198 struct file *file; 197 struct file *file;
199 struct vfsmount *mnt; 198 struct vfsmount *mnt, *internal;
200 struct pid_namespace *ns; 199 struct pid_namespace *ns = task_active_pid_ns(current);
201 struct bsd_acct_struct *acct = NULL; 200 struct bsd_acct_struct *acct, *old;
201 int err;
202
203 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
204 if (!acct)
205 return -ENOMEM;
202 206
203 /* Difference from BSD - they don't do O_APPEND */ 207 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 208 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 209 if (IS_ERR(file)) {
210 kfree(acct);
206 return PTR_ERR(file); 211 return PTR_ERR(file);
212 }
207 213
208 if (!S_ISREG(file_inode(file)->i_mode)) { 214 if (!S_ISREG(file_inode(file)->i_mode)) {
215 kfree(acct);
209 filp_close(file, NULL); 216 filp_close(file, NULL);
210 return -EACCES; 217 return -EACCES;
211 } 218 }
212 219
213 if (!file->f_op->write) { 220 if (!file->f_op->write) {
221 kfree(acct);
214 filp_close(file, NULL); 222 filp_close(file, NULL);
215 return -EIO; 223 return -EIO;
216 } 224 }
217 225 internal = mnt_clone_internal(&file->f_path);
218 ns = task_active_pid_ns(current); 226 if (IS_ERR(internal)) {
219 if (ns->bacct == NULL) { 227 kfree(acct);
220 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); 228 filp_close(file, NULL);
221 if (acct == NULL) { 229 return PTR_ERR(internal);
222 filp_close(file, NULL);
223 return -ENOMEM;
224 }
225 } 230 }
226 231 err = mnt_want_write(internal);
227 spin_lock(&acct_lock); 232 if (err) {
228 if (ns->bacct == NULL) { 233 mntput(internal);
229 ns->bacct = acct; 234 kfree(acct);
230 acct = NULL; 235 filp_close(file, NULL);
236 return err;
231 } 237 }
232
233 mnt = file->f_path.mnt; 238 mnt = file->f_path.mnt;
234 mnt_pin(mnt); 239 file->f_path.mnt = internal;
235 acct_file_reopen(ns->bacct, file, ns); 240
236 spin_unlock(&acct_lock); 241 atomic_long_set(&acct->pin.count, 1);
237 242 acct->pin.kill = acct_pin_kill;
238 mntput(mnt); /* it's pinned, now give up active reference */ 243 acct->file = file;
239 kfree(acct); 244 acct->needcheck = jiffies;
240 245 acct->ns = ns;
246 mutex_init(&acct->lock);
247 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
248 pin_insert(&acct->pin, mnt);
249
250 old = acct_get(ns);
251 if (old)
252 acct_kill(old, acct);
253 else
254 ns->bacct = acct;
255 mutex_unlock(&acct->lock);
256 mnt_drop_write(mnt);
257 mntput(mnt);
241 return 0; 258 return 0;
242} 259}
243 260
261static DEFINE_MUTEX(acct_on_mutex);
262
244/** 263/**
245 * sys_acct - enable/disable process accounting 264 * sys_acct - enable/disable process accounting
246 * @name: file name for accounting records or NULL to shutdown accounting 265 * @name: file name for accounting records or NULL to shutdown accounting
@@ -264,78 +283,20 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
264 283
265 if (IS_ERR(tmp)) 284 if (IS_ERR(tmp))
266 return PTR_ERR(tmp); 285 return PTR_ERR(tmp);
286 mutex_lock(&acct_on_mutex);
267 error = acct_on(tmp); 287 error = acct_on(tmp);
288 mutex_unlock(&acct_on_mutex);
268 putname(tmp); 289 putname(tmp);
269 } else { 290 } else {
270 struct bsd_acct_struct *acct; 291 acct_kill(acct_get(task_active_pid_ns(current)), NULL);
271
272 acct = task_active_pid_ns(current)->bacct;
273 if (acct == NULL)
274 return 0;
275
276 spin_lock(&acct_lock);
277 acct_file_reopen(acct, NULL, NULL);
278 spin_unlock(&acct_lock);
279 } 292 }
280 293
281 return error; 294 return error;
282} 295}
283 296
284/**
285 * acct_auto_close - turn off a filesystem's accounting if it is on
286 * @m: vfsmount being shut down
287 *
288 * If the accounting is turned on for a file in the subtree pointed to
289 * to by m, turn accounting off. Done when m is about to die.
290 */
291void acct_auto_close_mnt(struct vfsmount *m)
292{
293 struct bsd_acct_struct *acct;
294
295 spin_lock(&acct_lock);
296restart:
297 list_for_each_entry(acct, &acct_list, list)
298 if (acct->file && acct->file->f_path.mnt == m) {
299 acct_file_reopen(acct, NULL, NULL);
300 goto restart;
301 }
302 spin_unlock(&acct_lock);
303}
304
305/**
306 * acct_auto_close - turn off a filesystem's accounting if it is on
307 * @sb: super block for the filesystem
308 *
309 * If the accounting is turned on for a file in the filesystem pointed
310 * to by sb, turn accounting off.
311 */
312void acct_auto_close(struct super_block *sb)
313{
314 struct bsd_acct_struct *acct;
315
316 spin_lock(&acct_lock);
317restart:
318 list_for_each_entry(acct, &acct_list, list)
319 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
320 acct_file_reopen(acct, NULL, NULL);
321 goto restart;
322 }
323 spin_unlock(&acct_lock);
324}
325
326void acct_exit_ns(struct pid_namespace *ns) 297void acct_exit_ns(struct pid_namespace *ns)
327{ 298{
328 struct bsd_acct_struct *acct = ns->bacct; 299 acct_kill(acct_get(ns), NULL);
329
330 if (acct == NULL)
331 return;
332
333 spin_lock(&acct_lock);
334 if (acct->file != NULL)
335 acct_file_reopen(acct, NULL, NULL);
336 spin_unlock(&acct_lock);
337
338 kfree(acct);
339} 300}
340 301
341/* 302/*
@@ -450,38 +411,20 @@ static u32 encode_float(u64 value)
450 * do_exit() or when switching to a different output file. 411 * do_exit() or when switching to a different output file.
451 */ 412 */
452 413
453/* 414static void fill_ac(acct_t *ac)
454 * do_acct_process does all actual work. Caller holds the reference to file.
455 */
456static void do_acct_process(struct bsd_acct_struct *acct,
457 struct pid_namespace *ns, struct file *file)
458{ 415{
459 struct pacct_struct *pacct = &current->signal->pacct; 416 struct pacct_struct *pacct = &current->signal->pacct;
460 acct_t ac;
461 mm_segment_t fs;
462 unsigned long flim;
463 u64 elapsed, run_time; 417 u64 elapsed, run_time;
464 struct tty_struct *tty; 418 struct tty_struct *tty;
465 const struct cred *orig_cred;
466
467 /* Perform file operations on behalf of whoever enabled accounting */
468 orig_cred = override_creds(file->f_cred);
469
470 /*
471 * First check to see if there is enough free_space to continue
472 * the process accounting system.
473 */
474 if (!check_free_space(acct, file))
475 goto out;
476 419
477 /* 420 /*
478 * Fill the accounting struct with the needed info as recorded 421 * Fill the accounting struct with the needed info as recorded
479 * by the different kernel functions. 422 * by the different kernel functions.
480 */ 423 */
481 memset(&ac, 0, sizeof(acct_t)); 424 memset(ac, 0, sizeof(acct_t));
482 425
483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 426 ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 427 strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
485 428
486 /* calculate run_time in nsec*/ 429 /* calculate run_time in nsec*/
487 run_time = ktime_get_ns(); 430 run_time = ktime_get_ns();
@@ -489,9 +432,9 @@ static void do_acct_process(struct bsd_acct_struct *acct,
489 /* convert nsec -> AHZ */ 432 /* convert nsec -> AHZ */
490 elapsed = nsec_to_AHZ(run_time); 433 elapsed = nsec_to_AHZ(run_time);
491#if ACCT_VERSION == 3 434#if ACCT_VERSION == 3
492 ac.ac_etime = encode_float(elapsed); 435 ac->ac_etime = encode_float(elapsed);
493#else 436#else
494 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 437 ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
495 (unsigned long) elapsed : (unsigned long) -1l); 438 (unsigned long) elapsed : (unsigned long) -1l);
496#endif 439#endif
497#if ACCT_VERSION == 1 || ACCT_VERSION == 2 440#if ACCT_VERSION == 1 || ACCT_VERSION == 2
@@ -499,18 +442,58 @@ static void do_acct_process(struct bsd_acct_struct *acct,
499 /* new enlarged etime field */ 442 /* new enlarged etime field */
500 comp2_t etime = encode_comp2_t(elapsed); 443 comp2_t etime = encode_comp2_t(elapsed);
501 444
502 ac.ac_etime_hi = etime >> 16; 445 ac->ac_etime_hi = etime >> 16;
503 ac.ac_etime_lo = (u16) etime; 446 ac->ac_etime_lo = (u16) etime;
504 } 447 }
505#endif 448#endif
506 do_div(elapsed, AHZ); 449 do_div(elapsed, AHZ);
507 ac.ac_btime = get_seconds() - elapsed; 450 ac->ac_btime = get_seconds() - elapsed;
451#if ACCT_VERSION==2
452 ac->ac_ahz = AHZ;
453#endif
454
455 spin_lock_irq(&current->sighand->siglock);
456 tty = current->signal->tty; /* Safe as we hold the siglock */
457 ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
458 ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
459 ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
460 ac->ac_flag = pacct->ac_flag;
461 ac->ac_mem = encode_comp_t(pacct->ac_mem);
462 ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
463 ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
464 ac->ac_exitcode = pacct->ac_exitcode;
465 spin_unlock_irq(&current->sighand->siglock);
466}
467/*
468 * do_acct_process does all actual work. Caller holds the reference to file.
469 */
470static void do_acct_process(struct bsd_acct_struct *acct)
471{
472 acct_t ac;
473 unsigned long flim;
474 const struct cred *orig_cred;
475 struct pid_namespace *ns = acct->ns;
476 struct file *file = acct->file;
477
478 /*
479 * Accounting records are not subject to resource limits.
480 */
481 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
482 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
483 /* Perform file operations on behalf of whoever enabled accounting */
484 orig_cred = override_creds(file->f_cred);
485
486 /*
487 * First check to see if there is enough free_space to continue
488 * the process accounting system.
489 */
490 if (!check_free_space(acct))
491 goto out;
492
493 fill_ac(&ac);
508 /* we really need to bite the bullet and change layout */ 494 /* we really need to bite the bullet and change layout */
509 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 495 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
510 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 496 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
511#if ACCT_VERSION == 2
512 ac.ac_ahz = AHZ;
513#endif
514#if ACCT_VERSION == 1 || ACCT_VERSION == 2 497#if ACCT_VERSION == 1 || ACCT_VERSION == 2
515 /* backward-compatible 16 bit fields */ 498 /* backward-compatible 16 bit fields */
516 ac.ac_uid16 = ac.ac_uid; 499 ac.ac_uid16 = ac.ac_uid;
@@ -522,45 +505,18 @@ static void do_acct_process(struct bsd_acct_struct *acct,
522 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 505 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
523 rcu_read_unlock(); 506 rcu_read_unlock();
524#endif 507#endif
525
526 spin_lock_irq(&current->sighand->siglock);
527 tty = current->signal->tty; /* Safe as we hold the siglock */
528 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
529 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
530 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
531 ac.ac_flag = pacct->ac_flag;
532 ac.ac_mem = encode_comp_t(pacct->ac_mem);
533 ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
534 ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
535 ac.ac_exitcode = pacct->ac_exitcode;
536 spin_unlock_irq(&current->sighand->siglock);
537 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
538 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
539 ac.ac_swaps = encode_comp_t(0);
540
541 /* 508 /*
542 * Get freeze protection. If the fs is frozen, just skip the write 509 * Get freeze protection. If the fs is frozen, just skip the write
543 * as we could deadlock the system otherwise. 510 * as we could deadlock the system otherwise.
544 */ 511 */
545 if (!file_start_write_trylock(file)) 512 if (file_start_write_trylock(file)) {
546 goto out; 513 /* it's been opened O_APPEND, so position is irrelevant */
547 /* 514 loff_t pos = 0;
548 * Kernel segment override to datasegment and write it 515 __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
549 * to the accounting file. 516 file_end_write(file);
550 */ 517 }
551 fs = get_fs();
552 set_fs(KERNEL_DS);
553 /*
554 * Accounting records are not subject to resource limits.
555 */
556 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
557 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
558 file->f_op->write(file, (char *)&ac,
559 sizeof(acct_t), &file->f_pos);
560 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
561 set_fs(fs);
562 file_end_write(file);
563out: 518out:
519 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
564 revert_creds(orig_cred); 520 revert_creds(orig_cred);
565} 521}
566 522
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
609 spin_unlock_irq(&current->sighand->siglock); 565 spin_unlock_irq(&current->sighand->siglock);
610} 566}
611 567
612static void acct_process_in_ns(struct pid_namespace *ns) 568static void slow_acct_process(struct pid_namespace *ns)
613{ 569{
614 struct file *file = NULL; 570 for ( ; ns; ns = ns->parent) {
615 struct bsd_acct_struct *acct; 571 struct bsd_acct_struct *acct = acct_get(ns);
616 572 if (acct) {
617 acct = ns->bacct; 573 do_acct_process(acct);
618 /* 574 mutex_unlock(&acct->lock);
619 * accelerate the common fastpath: 575 pin_put(&acct->pin);
620 */ 576 }
621 if (!acct || !acct->file)
622 return;
623
624 spin_lock(&acct_lock);
625 file = acct->file;
626 if (unlikely(!file)) {
627 spin_unlock(&acct_lock);
628 return;
629 } 577 }
630 get_file(file);
631 spin_unlock(&acct_lock);
632
633 do_acct_process(acct, ns, file);
634 fput(file);
635} 578}
636 579
637/** 580/**
638 * acct_process - now just a wrapper around acct_process_in_ns, 581 * acct_process
639 * which in turn is a wrapper around do_acct_process.
640 * 582 *
641 * handles process accounting for an exiting task 583 * handles process accounting for an exiting task
642 */ 584 */
@@ -649,6 +591,10 @@ void acct_process(void)
649 * alive and holds its namespace, which in turn holds 591 * alive and holds its namespace, which in turn holds
650 * its parent. 592 * its parent.
651 */ 593 */
652 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) 594 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
653 acct_process_in_ns(ns); 595 if (ns->bacct)
596 break;
597 }
598 if (unlikely(ns))
599 slow_acct_process(ns);
654} 600}
diff --git a/mm/filemap.c b/mm/filemap.c
index f501b56ec2c6..90effcdf948d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2602,7 +2602,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2602 * that this differs from normal direct-io semantics, which 2602 * that this differs from normal direct-io semantics, which
2603 * will return -EFOO even if some bytes were written. 2603 * will return -EFOO even if some bytes were written.
2604 */ 2604 */
2605 if (unlikely(status < 0) && !written) { 2605 if (unlikely(status < 0)) {
2606 err = status; 2606 err = status;
2607 goto out; 2607 goto out;
2608 } 2608 }
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 7b5dbd1517b5..ab88dc0ea1d3 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
310EXPORT_SYMBOL(iov_iter_init); 310EXPORT_SYMBOL(iov_iter_init);
311 311
312static ssize_t get_pages_iovec(struct iov_iter *i, 312static ssize_t get_pages_iovec(struct iov_iter *i,
313 struct page **pages, size_t maxsize, 313 struct page **pages, unsigned maxpages,
314 size_t *start) 314 size_t *start)
315{ 315{
316 size_t offset = i->iov_offset; 316 size_t offset = i->iov_offset;
@@ -323,10 +323,10 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
323 len = iov->iov_len - offset; 323 len = iov->iov_len - offset;
324 if (len > i->count) 324 if (len > i->count)
325 len = i->count; 325 len = i->count;
326 if (len > maxsize)
327 len = maxsize;
328 addr = (unsigned long)iov->iov_base + offset; 326 addr = (unsigned long)iov->iov_base + offset;
329 len += *start = addr & (PAGE_SIZE - 1); 327 len += *start = addr & (PAGE_SIZE - 1);
328 if (len > maxpages * PAGE_SIZE)
329 len = maxpages * PAGE_SIZE;
330 addr &= ~(PAGE_SIZE - 1); 330 addr &= ~(PAGE_SIZE - 1);
331 n = (len + PAGE_SIZE - 1) / PAGE_SIZE; 331 n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
332 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); 332 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
@@ -588,15 +588,14 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
588} 588}
589 589
590static ssize_t get_pages_bvec(struct iov_iter *i, 590static ssize_t get_pages_bvec(struct iov_iter *i,
591 struct page **pages, size_t maxsize, 591 struct page **pages, unsigned maxpages,
592 size_t *start) 592 size_t *start)
593{ 593{
594 const struct bio_vec *bvec = i->bvec; 594 const struct bio_vec *bvec = i->bvec;
595 size_t len = bvec->bv_len - i->iov_offset; 595 size_t len = bvec->bv_len - i->iov_offset;
596 if (len > i->count) 596 if (len > i->count)
597 len = i->count; 597 len = i->count;
598 if (len > maxsize) 598 /* can't be more than PAGE_SIZE */
599 len = maxsize;
600 *start = bvec->bv_offset + i->iov_offset; 599 *start = bvec->bv_offset + i->iov_offset;
601 600
602 get_page(*pages = bvec->bv_page); 601 get_page(*pages = bvec->bv_page);
@@ -712,13 +711,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
712EXPORT_SYMBOL(iov_iter_alignment); 711EXPORT_SYMBOL(iov_iter_alignment);
713 712
714ssize_t iov_iter_get_pages(struct iov_iter *i, 713ssize_t iov_iter_get_pages(struct iov_iter *i,
715 struct page **pages, size_t maxsize, 714 struct page **pages, unsigned maxpages,
716 size_t *start) 715 size_t *start)
717{ 716{
718 if (i->type & ITER_BVEC) 717 if (i->type & ITER_BVEC)
719 return get_pages_bvec(i, pages, maxsize, start); 718 return get_pages_bvec(i, pages, maxpages, start);
720 else 719 else
721 return get_pages_iovec(i, pages, maxsize, start); 720 return get_pages_iovec(i, pages, maxpages, start);
722} 721}
723EXPORT_SYMBOL(iov_iter_get_pages); 722EXPORT_SYMBOL(iov_iter_get_pages);
724 723
diff --git a/mm/shmem.c b/mm/shmem.c
index a42add14331c..0e5fb225007c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2323,17 +2323,45 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2323 return shmem_unlink(dir, dentry); 2323 return shmem_unlink(dir, dentry);
2324} 2324}
2325 2325
2326static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2327{
2328 bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2329 bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
2330
2331 if (old_dir != new_dir && old_is_dir != new_is_dir) {
2332 if (old_is_dir) {
2333 drop_nlink(old_dir);
2334 inc_nlink(new_dir);
2335 } else {
2336 drop_nlink(new_dir);
2337 inc_nlink(old_dir);
2338 }
2339 }
2340 old_dir->i_ctime = old_dir->i_mtime =
2341 new_dir->i_ctime = new_dir->i_mtime =
2342 old_dentry->d_inode->i_ctime =
2343 new_dentry->d_inode->i_ctime = CURRENT_TIME;
2344
2345 return 0;
2346}
2347
2326/* 2348/*
2327 * The VFS layer already does all the dentry stuff for rename, 2349 * The VFS layer already does all the dentry stuff for rename,
2328 * we just have to decrement the usage count for the target if 2350 * we just have to decrement the usage count for the target if
2329 * it exists so that the VFS layer correctly free's it when it 2351 * it exists so that the VFS layer correctly free's it when it
2330 * gets overwritten. 2352 * gets overwritten.
2331 */ 2353 */
2332static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2354static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
2333{ 2355{
2334 struct inode *inode = old_dentry->d_inode; 2356 struct inode *inode = old_dentry->d_inode;
2335 int they_are_dirs = S_ISDIR(inode->i_mode); 2357 int they_are_dirs = S_ISDIR(inode->i_mode);
2336 2358
2359 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
2360 return -EINVAL;
2361
2362 if (flags & RENAME_EXCHANGE)
2363 return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
2364
2337 if (!simple_empty(new_dentry)) 2365 if (!simple_empty(new_dentry))
2338 return -ENOTEMPTY; 2366 return -ENOTEMPTY;
2339 2367
@@ -3087,7 +3115,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
3087 .mkdir = shmem_mkdir, 3115 .mkdir = shmem_mkdir,
3088 .rmdir = shmem_rmdir, 3116 .rmdir = shmem_rmdir,
3089 .mknod = shmem_mknod, 3117 .mknod = shmem_mknod,
3090 .rename = shmem_rename, 3118 .rename2 = shmem_rename2,
3091 .tmpfile = shmem_tmpfile, 3119 .tmpfile = shmem_tmpfile,
3092#endif 3120#endif
3093#ifdef CONFIG_TMPFS_XATTR 3121#ifdef CONFIG_TMPFS_XATTR