Diffstat (limited to 'fs/nfsd/vfs.c')
-rw-r--r--  fs/nfsd/vfs.c  161
1 file changed, 91 insertions(+), 70 deletions(-)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b660435978d2..4145083dcf88 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
 #include <linux/jhash.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -100,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 {
 	struct svc_export *exp = *expp, *exp2 = NULL;
 	struct dentry *dentry = *dpp;
-	struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-	struct dentry *mounts = dget(dentry);
+	struct path path = {.mnt = mntget(exp->ex_path.mnt),
+			    .dentry = dget(dentry)};
 	int err = 0;
 
-	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
+	while (d_mountpoint(path.dentry) && follow_down(&path))
+		;
 
-	exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
+	exp2 = rqst_exp_get_by_name(rqstp, &path);
 	if (IS_ERR(exp2)) {
 		if (PTR_ERR(exp2) != -ENOENT)
 			err = PTR_ERR(exp2);
-		dput(mounts);
-		mntput(mnt);
+		path_put(&path);
 		goto out;
 	}
 	if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
 		/* successfully crossed mount point */
 		/*
-		 * This is subtle: dentry is *not* under mnt at this point.
-		 * The only reason we are safe is that original mnt is pinned
-		 * down by exp, so we should dput before putting exp.
+		 * This is subtle: path.dentry is *not* on path.mnt
+		 * at this point.  The only reason we are safe is that
+		 * original mnt is pinned down by exp, so we should
+		 * put path *before* putting exp
 		 */
-		dput(dentry);
-		*dpp = mounts;
-		exp_put(exp);
+		*dpp = path.dentry;
+		path.dentry = dentry;
 		*expp = exp2;
-	} else {
-		exp_put(exp2);
-		dput(mounts);
+		exp2 = exp;
 	}
-	mntput(mnt);
+	path_put(&path);
+	exp_put(exp2);
 out:
 	return err;
 }
@@ -168,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			/* checking mountpoint crossing is very different when stepping up */
 			struct svc_export *exp2 = NULL;
 			struct dentry *dp;
-			struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-			dentry = dget(dparent);
-			while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
+			struct path path = {.mnt = mntget(exp->ex_path.mnt),
+					    .dentry = dget(dparent)};
+
+			while (path.dentry == path.mnt->mnt_root &&
+			       follow_up(&path))
 				;
-			dp = dget_parent(dentry);
-			dput(dentry);
-			dentry = dp;
+			dp = dget_parent(path.dentry);
+			dput(path.dentry);
+			path.dentry = dp;
 
-			exp2 = rqst_exp_parent(rqstp, mnt, dentry);
+			exp2 = rqst_exp_parent(rqstp, &path);
 			if (PTR_ERR(exp2) == -ENOENT) {
-				dput(dentry);
 				dentry = dget(dparent);
 			} else if (IS_ERR(exp2)) {
 				host_err = PTR_ERR(exp2);
-				dput(dentry);
-				mntput(mnt);
+				path_put(&path);
 				goto out_nfserr;
 			} else {
+				dentry = dget(path.dentry);
 				exp_put(exp);
 				exp = exp2;
 			}
-			mntput(mnt);
+			path_put(&path);
 		}
 	} else {
 		fh_lock(fhp);
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 				flags, cred);
 	if (IS_ERR(*filp))
 		host_err = PTR_ERR(*filp);
+	else
+		ima_counts_get(*filp);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
@@ -963,6 +966,43 @@ static void kill_suid(struct dentry *dentry)
 	mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
+/*
+ * Gathered writes: If another process is currently writing to the file,
+ * there's a high chance this is another nfsd (triggered by a bulk write
+ * from a client's biod). Rather than syncing the file with each write
+ * request, we sleep for 10 msec.
+ *
+ * I don't know if this roughly approximates C. Juszak's idea of
+ * gathered writes, but it's a nice and simple solution (IMHO), and it
+ * seems to work:-)
+ *
+ * Note: we do this only in the NFSv2 case, since v3 and higher have a
+ * better tool (separate unstable writes and commits) for solving this
+ * problem.
+ */
+static int wait_for_concurrent_writes(struct file *file)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	static ino_t last_ino;
+	static dev_t last_dev;
+	int err = 0;
+
+	if (atomic_read(&inode->i_writecount) > 1
+	    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+		dprintk("nfsd: write defer %d\n", task_pid_nr(current));
+		msleep(10);
+		dprintk("nfsd: write resume %d\n", task_pid_nr(current));
+	}
+
+	if (inode->i_state & I_DIRTY) {
+		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+		err = nfsd_sync(file);
+	}
+	last_ino = inode->i_ino;
+	last_dev = inode->i_sb->s_dev;
+	return err;
+}
+
 static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
@@ -975,6 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	__be32		err = 0;
 	int		host_err;
 	int		stable = *stablep;
+	int		use_wgather;
 
 #ifdef MSNFS
 	err = nfserr_perm;
@@ -993,9 +1034,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	 *  - the sync export option has been set, or
 	 *  - the client requested O_SYNC behavior (NFSv3 feature).
 	 *  - The file system doesn't support fsync().
-	 * When gathered writes have been configured for this volume,
+	 * When NFSv2 gathered writes have been configured for this volume,
 	 * flushing the data to disk is handled separately below.
 	 */
+	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
 
 	if (!file->f_op->fsync) {/* COMMIT3 cannot work */
 		stable = 2;
@@ -1004,7 +1046,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	if (!EX_ISSYNC(exp))
 		stable = 0;
-	if (stable && !EX_WGATHER(exp)) {
+	if (stable && !use_wgather) {
 		spin_lock(&file->f_lock);
 		file->f_flags |= O_SYNC;
 		spin_unlock(&file->f_lock);
@@ -1014,52 +1056,20 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	oldfs = get_fs(); set_fs(KERNEL_DS);
 	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
 	set_fs(oldfs);
-	if (host_err >= 0) {
-		*cnt = host_err;
-		nfsdstats.io_write += host_err;
-		fsnotify_modify(file->f_path.dentry);
-	}
+	if (host_err < 0)
+		goto out_nfserr;
+	*cnt = host_err;
+	nfsdstats.io_write += host_err;
+	fsnotify_modify(file->f_path.dentry);
 
 	/* clear setuid/setgid flag after write */
-	if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+	if (inode->i_mode & (S_ISUID | S_ISGID))
 		kill_suid(dentry);
 
-	if (host_err >= 0 && stable) {
-		static ino_t last_ino;
-		static dev_t last_dev;
-
-		/*
-		 * Gathered writes: If another process is currently
-		 * writing to the file, there's a high chance
-		 * this is another nfsd (triggered by a bulk write
-		 * from a client's biod). Rather than syncing the
-		 * file with each write request, we sleep for 10 msec.
-		 *
-		 * I don't know if this roughly approximates
-		 * C. Juszak's idea of gathered writes, but it's a
-		 * nice and simple solution (IMHO), and it seems to
-		 * work:-)
-		 */
-		if (EX_WGATHER(exp)) {
-			if (atomic_read(&inode->i_writecount) > 1
-			    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
-				dprintk("nfsd: write defer %d\n", task_pid_nr(current));
-				msleep(10);
-				dprintk("nfsd: write resume %d\n", task_pid_nr(current));
-			}
-
-			if (inode->i_state & I_DIRTY) {
-				dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-				host_err=nfsd_sync(file);
-			}
-#if 0
-			wake_up(&inode->i_wait);
-#endif
-		}
-		last_ino = inode->i_ino;
-		last_dev = inode->i_sb->s_dev;
-	}
+	if (stable && use_wgather)
+		host_err = wait_for_concurrent_writes(file);
 
+out_nfserr:
 	dprintk("nfsd: write complete host_err=%d\n", host_err);
 	if (host_err >= 0)
 		err = 0;
@@ -2024,6 +2034,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 					struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
+	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2107,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
+	if (err)
+		goto nfsd_out;
 
+	/* Do integrity (permission) checking now, but defer incrementing
+	 * IMA counts to the actual file open.
+	 */
+	path.mnt = exp->ex_path.mnt;
+	path.dentry = dentry;
+	err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
+			     IMA_COUNT_LEAVE);
+nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
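
Note: the nfsd_vfs_write() change is spread over four hunks above (the new wait_for_concurrent_writes() helper, the use_wgather flag, the O_SYNC test, and the call site). Pieced together, the resulting NFSv2-only gathered-write path looks roughly like the sketch below. This is assembled from the hunks for readability, not a verbatim copy of the resulting fs/nfsd/vfs.c; elided code is marked with comments.

	/* Decide once whether NFSv2 write gathering applies to this request:
	 * only v2 writes on exports with the wgather option set. */
	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);

	if (!EX_ISSYNC(exp))
		stable = 0;
	if (stable && !use_wgather) {
		/* Stable write without gathering: force the write out synchronously. */
		spin_lock(&file->f_lock);
		file->f_flags |= O_SYNC;
		spin_unlock(&file->f_lock);
	}

	/* ... vfs_writev(), stats, fsnotify and setuid/setgid handling ... */

	if (stable && use_wgather)
		/* Formerly the inline EX_WGATHER block: sleep briefly if other
		 * writers are active, then sync the file if it is still dirty. */
		host_err = wait_for_concurrent_writes(file);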