diff options
Diffstat (limited to 'fs/nfsd/vfs.c')
-rw-r--r-- | fs/nfsd/vfs.c | 161 |
1 files changed, 91 insertions, 70 deletions
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b660435978d2..4145083dcf88 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/security.h> | 55 | #include <linux/security.h> |
56 | #endif /* CONFIG_NFSD_V4 */ | 56 | #endif /* CONFIG_NFSD_V4 */ |
57 | #include <linux/jhash.h> | 57 | #include <linux/jhash.h> |
58 | #include <linux/ima.h> | ||
58 | 59 | ||
59 | #include <asm/uaccess.h> | 60 | #include <asm/uaccess.h> |
60 | 61 | ||
@@ -100,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, | |||
100 | { | 101 | { |
101 | struct svc_export *exp = *expp, *exp2 = NULL; | 102 | struct svc_export *exp = *expp, *exp2 = NULL; |
102 | struct dentry *dentry = *dpp; | 103 | struct dentry *dentry = *dpp; |
103 | struct vfsmount *mnt = mntget(exp->ex_path.mnt); | 104 | struct path path = {.mnt = mntget(exp->ex_path.mnt), |
104 | struct dentry *mounts = dget(dentry); | 105 | .dentry = dget(dentry)}; |
105 | int err = 0; | 106 | int err = 0; |
106 | 107 | ||
107 | while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); | 108 | while (d_mountpoint(path.dentry) && follow_down(&path)) |
109 | ; | ||
108 | 110 | ||
109 | exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); | 111 | exp2 = rqst_exp_get_by_name(rqstp, &path); |
110 | if (IS_ERR(exp2)) { | 112 | if (IS_ERR(exp2)) { |
111 | if (PTR_ERR(exp2) != -ENOENT) | 113 | if (PTR_ERR(exp2) != -ENOENT) |
112 | err = PTR_ERR(exp2); | 114 | err = PTR_ERR(exp2); |
113 | dput(mounts); | 115 | path_put(&path); |
114 | mntput(mnt); | ||
115 | goto out; | 116 | goto out; |
116 | } | 117 | } |
117 | if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { | 118 | if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { |
118 | /* successfully crossed mount point */ | 119 | /* successfully crossed mount point */ |
119 | /* | 120 | /* |
120 | * This is subtle: dentry is *not* under mnt at this point. | 121 | * This is subtle: path.dentry is *not* on path.mnt |
121 | * The only reason we are safe is that original mnt is pinned | 122 | * at this point. The only reason we are safe is that |
122 | * down by exp, so we should dput before putting exp. | 123 | * original mnt is pinned down by exp, so we should |
124 | * put path *before* putting exp | ||
123 | */ | 125 | */ |
124 | dput(dentry); | 126 | *dpp = path.dentry; |
125 | *dpp = mounts; | 127 | path.dentry = dentry; |
126 | exp_put(exp); | ||
127 | *expp = exp2; | 128 | *expp = exp2; |
128 | } else { | 129 | exp2 = exp; |
129 | exp_put(exp2); | ||
130 | dput(mounts); | ||
131 | } | 130 | } |
132 | mntput(mnt); | 131 | path_put(&path); |
132 | exp_put(exp2); | ||
133 | out: | 133 | out: |
134 | return err; | 134 | return err; |
135 | } | 135 | } |
@@ -168,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
168 | /* checking mountpoint crossing is very different when stepping up */ | 168 | /* checking mountpoint crossing is very different when stepping up */ |
169 | struct svc_export *exp2 = NULL; | 169 | struct svc_export *exp2 = NULL; |
170 | struct dentry *dp; | 170 | struct dentry *dp; |
171 | struct vfsmount *mnt = mntget(exp->ex_path.mnt); | 171 | struct path path = {.mnt = mntget(exp->ex_path.mnt), |
172 | dentry = dget(dparent); | 172 | .dentry = dget(dparent)}; |
173 | while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) | 173 | |
174 | while (path.dentry == path.mnt->mnt_root && | ||
175 | follow_up(&path)) | ||
174 | ; | 176 | ; |
175 | dp = dget_parent(dentry); | 177 | dp = dget_parent(path.dentry); |
176 | dput(dentry); | 178 | dput(path.dentry); |
177 | dentry = dp; | 179 | path.dentry = dp; |
178 | 180 | ||
179 | exp2 = rqst_exp_parent(rqstp, mnt, dentry); | 181 | exp2 = rqst_exp_parent(rqstp, &path); |
180 | if (PTR_ERR(exp2) == -ENOENT) { | 182 | if (PTR_ERR(exp2) == -ENOENT) { |
181 | dput(dentry); | ||
182 | dentry = dget(dparent); | 183 | dentry = dget(dparent); |
183 | } else if (IS_ERR(exp2)) { | 184 | } else if (IS_ERR(exp2)) { |
184 | host_err = PTR_ERR(exp2); | 185 | host_err = PTR_ERR(exp2); |
185 | dput(dentry); | 186 | path_put(&path); |
186 | mntput(mnt); | ||
187 | goto out_nfserr; | 187 | goto out_nfserr; |
188 | } else { | 188 | } else { |
189 | dentry = dget(path.dentry); | ||
189 | exp_put(exp); | 190 | exp_put(exp); |
190 | exp = exp2; | 191 | exp = exp2; |
191 | } | 192 | } |
192 | mntput(mnt); | 193 | path_put(&path); |
193 | } | 194 | } |
194 | } else { | 195 | } else { |
195 | fh_lock(fhp); | 196 | fh_lock(fhp); |
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, | |||
735 | flags, cred); | 736 | flags, cred); |
736 | if (IS_ERR(*filp)) | 737 | if (IS_ERR(*filp)) |
737 | host_err = PTR_ERR(*filp); | 738 | host_err = PTR_ERR(*filp); |
739 | else | ||
740 | ima_counts_get(*filp); | ||
738 | out_nfserr: | 741 | out_nfserr: |
739 | err = nfserrno(host_err); | 742 | err = nfserrno(host_err); |
740 | out: | 743 | out: |
@@ -963,6 +966,43 @@ static void kill_suid(struct dentry *dentry) | |||
963 | mutex_unlock(&dentry->d_inode->i_mutex); | 966 | mutex_unlock(&dentry->d_inode->i_mutex); |
964 | } | 967 | } |
965 | 968 | ||
969 | /* | ||
970 | * Gathered writes: If another process is currently writing to the file, | ||
971 | * there's a high chance this is another nfsd (triggered by a bulk write | ||
972 | * from a client's biod). Rather than syncing the file with each write | ||
973 | * request, we sleep for 10 msec. | ||
974 | * | ||
975 | * I don't know if this roughly approximates C. Juszak's idea of | ||
976 | * gathered writes, but it's a nice and simple solution (IMHO), and it | ||
977 | * seems to work:-) | ||
978 | * | ||
979 | * Note: we do this only in the NFSv2 case, since v3 and higher have a | ||
980 | * better tool (separate unstable writes and commits) for solving this | ||
981 | * problem. | ||
982 | */ | ||
983 | static int wait_for_concurrent_writes(struct file *file) | ||
984 | { | ||
985 | struct inode *inode = file->f_path.dentry->d_inode; | ||
986 | static ino_t last_ino; | ||
987 | static dev_t last_dev; | ||
988 | int err = 0; | ||
989 | |||
990 | if (atomic_read(&inode->i_writecount) > 1 | ||
991 | || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { | ||
992 | dprintk("nfsd: write defer %d\n", task_pid_nr(current)); | ||
993 | msleep(10); | ||
994 | dprintk("nfsd: write resume %d\n", task_pid_nr(current)); | ||
995 | } | ||
996 | |||
997 | if (inode->i_state & I_DIRTY) { | ||
998 | dprintk("nfsd: write sync %d\n", task_pid_nr(current)); | ||
999 | err = nfsd_sync(file); | ||
1000 | } | ||
1001 | last_ino = inode->i_ino; | ||
1002 | last_dev = inode->i_sb->s_dev; | ||
1003 | return err; | ||
1004 | } | ||
1005 | |||
966 | static __be32 | 1006 | static __be32 |
967 | nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | 1007 | nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, |
968 | loff_t offset, struct kvec *vec, int vlen, | 1008 | loff_t offset, struct kvec *vec, int vlen, |
@@ -975,6 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | |||
975 | __be32 err = 0; | 1015 | __be32 err = 0; |
976 | int host_err; | 1016 | int host_err; |
977 | int stable = *stablep; | 1017 | int stable = *stablep; |
1018 | int use_wgather; | ||
978 | 1019 | ||
979 | #ifdef MSNFS | 1020 | #ifdef MSNFS |
980 | err = nfserr_perm; | 1021 | err = nfserr_perm; |
@@ -993,9 +1034,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | |||
993 | * - the sync export option has been set, or | 1034 | * - the sync export option has been set, or |
994 | * - the client requested O_SYNC behavior (NFSv3 feature). | 1035 | * - the client requested O_SYNC behavior (NFSv3 feature). |
995 | * - The file system doesn't support fsync(). | 1036 | * - The file system doesn't support fsync(). |
996 | * When gathered writes have been configured for this volume, | 1037 | * When NFSv2 gathered writes have been configured for this volume, |
997 | * flushing the data to disk is handled separately below. | 1038 | * flushing the data to disk is handled separately below. |
998 | */ | 1039 | */ |
1040 | use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); | ||
999 | 1041 | ||
1000 | if (!file->f_op->fsync) {/* COMMIT3 cannot work */ | 1042 | if (!file->f_op->fsync) {/* COMMIT3 cannot work */ |
1001 | stable = 2; | 1043 | stable = 2; |
@@ -1004,7 +1046,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | |||
1004 | 1046 | ||
1005 | if (!EX_ISSYNC(exp)) | 1047 | if (!EX_ISSYNC(exp)) |
1006 | stable = 0; | 1048 | stable = 0; |
1007 | if (stable && !EX_WGATHER(exp)) { | 1049 | if (stable && !use_wgather) { |
1008 | spin_lock(&file->f_lock); | 1050 | spin_lock(&file->f_lock); |
1009 | file->f_flags |= O_SYNC; | 1051 | file->f_flags |= O_SYNC; |
1010 | spin_unlock(&file->f_lock); | 1052 | spin_unlock(&file->f_lock); |
@@ -1014,52 +1056,20 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | |||
1014 | oldfs = get_fs(); set_fs(KERNEL_DS); | 1056 | oldfs = get_fs(); set_fs(KERNEL_DS); |
1015 | host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); | 1057 | host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); |
1016 | set_fs(oldfs); | 1058 | set_fs(oldfs); |
1017 | if (host_err >= 0) { | 1059 | if (host_err < 0) |
1018 | *cnt = host_err; | 1060 | goto out_nfserr; |
1019 | nfsdstats.io_write += host_err; | 1061 | *cnt = host_err; |
1020 | fsnotify_modify(file->f_path.dentry); | 1062 | nfsdstats.io_write += host_err; |
1021 | } | 1063 | fsnotify_modify(file->f_path.dentry); |
1022 | 1064 | ||
1023 | /* clear setuid/setgid flag after write */ | 1065 | /* clear setuid/setgid flag after write */ |
1024 | if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) | 1066 | if (inode->i_mode & (S_ISUID | S_ISGID)) |
1025 | kill_suid(dentry); | 1067 | kill_suid(dentry); |
1026 | 1068 | ||
1027 | if (host_err >= 0 && stable) { | 1069 | if (stable && use_wgather) |
1028 | static ino_t last_ino; | 1070 | host_err = wait_for_concurrent_writes(file); |
1029 | static dev_t last_dev; | ||
1030 | |||
1031 | /* | ||
1032 | * Gathered writes: If another process is currently | ||
1033 | * writing to the file, there's a high chance | ||
1034 | * this is another nfsd (triggered by a bulk write | ||
1035 | * from a client's biod). Rather than syncing the | ||
1036 | * file with each write request, we sleep for 10 msec. | ||
1037 | * | ||
1038 | * I don't know if this roughly approximates | ||
1039 | * C. Juszak's idea of gathered writes, but it's a | ||
1040 | * nice and simple solution (IMHO), and it seems to | ||
1041 | * work:-) | ||
1042 | */ | ||
1043 | if (EX_WGATHER(exp)) { | ||
1044 | if (atomic_read(&inode->i_writecount) > 1 | ||
1045 | || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { | ||
1046 | dprintk("nfsd: write defer %d\n", task_pid_nr(current)); | ||
1047 | msleep(10); | ||
1048 | dprintk("nfsd: write resume %d\n", task_pid_nr(current)); | ||
1049 | } | ||
1050 | |||
1051 | if (inode->i_state & I_DIRTY) { | ||
1052 | dprintk("nfsd: write sync %d\n", task_pid_nr(current)); | ||
1053 | host_err=nfsd_sync(file); | ||
1054 | } | ||
1055 | #if 0 | ||
1056 | wake_up(&inode->i_wait); | ||
1057 | #endif | ||
1058 | } | ||
1059 | last_ino = inode->i_ino; | ||
1060 | last_dev = inode->i_sb->s_dev; | ||
1061 | } | ||
1062 | 1071 | ||
1072 | out_nfserr: | ||
1063 | dprintk("nfsd: write complete host_err=%d\n", host_err); | 1073 | dprintk("nfsd: write complete host_err=%d\n", host_err); |
1064 | if (host_err >= 0) | 1074 | if (host_err >= 0) |
1065 | err = 0; | 1075 | err = 0; |
@@ -2024,6 +2034,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, | |||
2024 | struct dentry *dentry, int acc) | 2034 | struct dentry *dentry, int acc) |
2025 | { | 2035 | { |
2026 | struct inode *inode = dentry->d_inode; | 2036 | struct inode *inode = dentry->d_inode; |
2037 | struct path path; | ||
2027 | int err; | 2038 | int err; |
2028 | 2039 | ||
2029 | if (acc == NFSD_MAY_NOP) | 2040 | if (acc == NFSD_MAY_NOP) |
@@ -2096,7 +2107,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, | |||
2096 | if (err == -EACCES && S_ISREG(inode->i_mode) && | 2107 | if (err == -EACCES && S_ISREG(inode->i_mode) && |
2097 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) | 2108 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) |
2098 | err = inode_permission(inode, MAY_EXEC); | 2109 | err = inode_permission(inode, MAY_EXEC); |
2110 | if (err) | ||
2111 | goto nfsd_out; | ||
2099 | 2112 | ||
2113 | /* Do integrity (permission) checking now, but defer incrementing | ||
2114 | * IMA counts to the actual file open. | ||
2115 | */ | ||
2116 | path.mnt = exp->ex_path.mnt; | ||
2117 | path.dentry = dentry; | ||
2118 | err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), | ||
2119 | IMA_COUNT_LEAVE); | ||
2120 | nfsd_out: | ||
2100 | return err? nfserrno(err) : 0; | 2121 | return err? nfserrno(err) : 0; |
2101 | } | 2122 | } |
2102 | 2123 | ||