Diffstat (limited to 'fs/namespace.c')
-rw-r--r--	fs/namespace.c	316
1 file changed, 314 insertions(+), 2 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 94f026ec990a..678f7ce060f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 	return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	INIT_LIST_HEAD(&mnt->mnt_share);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	INIT_LIST_HEAD(&mnt->mnt_slave);
+	atomic_set(&mnt->__mnt_writers, 0);
 	if (name) {
 		int size = strlen(name) + 1;
 		char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly outside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is r/w right *now*. This cannot and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	if (mnt->mnt_flags & MNT_READONLY)
+		return 1;
+	if (mnt->mnt_sb->s_flags & MS_RDONLY)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+/*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
+}
+
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is
+ * about to be performed to it, and makes sure that
+ * writes are allowed before returning success. When
+ * the write operation is finished, mnt_drop_write()
+ * must be called. This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
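
A note on usage, since the kernel-doc above defines the contract: a caller brackets each discrete write with this pair rather than testing __mnt_is_readonly() directly. A minimal sketch of such a caller, assuming a hypothetical helper (do_namespace_write() is illustrative, not part of this patch):

static int do_namespace_write(struct vfsmount *mnt)
{
	int err;

	/* Fails with -EROFS if the mount (or its superblock) is r/o. */
	err = mnt_want_write(mnt);
	if (err)
		return err;
	/* ... perform the write while the writer count is held ... */
	mnt_drop_write(mnt);	/* mnt_drop_write() is defined below */
	return 0;
}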
+
+static void lock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count. Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	if (atomic_read(&mnt->__mnt_writers) >=
+	    MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * It isn't necessary to hold all of the locks
+	 * at the same time, but doing it this way makes
+	 * us share a lot more code.
+	 */
+	lock_mnt_writers();
+	/*
+	 * vfsmount_lock is for mnt_flags.
+	 */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * If coalescing the per-cpu writer counts did not
+	 * get us back to a positive writer count, we have
+	 * a bug.
+	 */
+	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		printk(KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, atomic_read(&mnt->__mnt_writers));
+		WARN_ON(1);
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
+}
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it. Must be matched with a
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	int must_check_underflow = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	} else {
+		must_check_underflow = 1;
+		atomic_dec(&mnt->__mnt_writers);
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * Logically, we could check this each time, but
+	 * the __mnt_writers cacheline tends to be cold,
+	 * and touching it each time would be expensive.
+	 */
+	if (must_check_underflow)
+		handle_write_count_underflow(mnt);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption. However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow. Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
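
The else branch above is where cross-cpu imbalance surfaces: a file open()ed for write on one cpu and close()d on another decrements a per-cpu count that is already zero, so the drop falls through to the shared atomic. A minimal userspace model of that drift, as a sketch (NCPUS, the helper names, and the two-cpu scenario are all illustrative):

#include <stdio.h>

#define NCPUS 2

static long cpu_count[NCPUS];	/* models per-cpu mnt_writer.count */
static long mnt_writers;	/* models mnt->__mnt_writers */

static void want_write(int cpu)
{
	cpu_count[cpu]++;		/* fast path: one local cacheline */
}

static void drop_write(int cpu)
{
	if (cpu_count[cpu] > 0)
		cpu_count[cpu]--;	/* balanced on this cpu */
	else
		mnt_writers--;		/* imbalance hits the shared count */
}

int main(void)
{
	want_write(0);			/* open() for write on cpu 0 */
	drop_write(1);			/* close() on cpu 1 */
	printf("cpu0=%ld cpu1=%ld shared=%ld\n",
	       cpu_count[0], cpu_count[1], mnt_writers);
	/* Prints cpu0=1 cpu1=0 shared=-1: the +1 is stranded per-cpu
	 * until the counts are coalesced under the locks, which
	 * handle_write_count_underflow() forces near -(1<<16). */
	return 0;
}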
+
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	lock_mnt_writers();
+	/*
+	 * With all of the locks held, this value is stable.
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * nobody can do a successful mnt_want_write() while
+	 * the per-cpu counts are coalesced and the locks held.
+	 */
+	spin_lock(&vfsmount_lock);
+	if (!ret)
+		mnt->mnt_flags |= MNT_READONLY;
+	spin_unlock(&vfsmount_lock);
+out:
+	unlock_mnt_writers();
+	return ret;
+}
+
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags &= ~MNT_READONLY;
+	spin_unlock(&vfsmount_lock);
+}
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
@@ -271,7 +532,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
+	/*
+	 * We don't have to hold all of the locks at the
+	 * same time here because we know that we're the
+	 * last reference to mnt and that no new writers
+	 * can come in.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+		if (cpu_writer->mnt != mnt)
+			continue;
+		spin_lock(&cpu_writer->lock);
+		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+		cpu_writer->count = 0;
+		/*
+		 * Might as well do this so that no one
+		 * ever sees the pointer and expects
+		 * it to be valid.
+		 */
+		cpu_writer->mnt = NULL;
+		spin_unlock(&cpu_writer->lock);
+	}
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair. If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	WARN_ON(atomic_read(&mnt->__mnt_writers));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
@@ -417,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 		seq_putc(m, '.');
 		mangle(m, mnt->mnt_sb->s_subtype);
 	}
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
 	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 		if (mnt->mnt_sb->s_flags & fs_infop->flag)
 			seq_puts(m, fs_infop->str);
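
The effect of this one-line change is visible in /proc/mounts: a mount whose vfsmount carries MNT_READONLY now reports " ro" even while its superblock is still r/w. A quick way to inspect it, as a sketch (the sample output line is illustrative):

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/mounts", "r");

	if (!f)
		return 1;
	/* After a read-only bind remount (see the mount() sketch
	 * further below), expect something like:
	 *   /dev/sda1 /mnt/ro ext3 ro 0 0
	 * while the original mount point still shows "rw". */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}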
@@ -1019,6 +1309,23 @@ out:
 	return err;
 }
 
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+	int error = 0;
+	int readonly_request = 0;
+
+	if (ms_flags & MS_RDONLY)
+		readonly_request = 1;
+	if (readonly_request == __mnt_is_readonly(mnt))
+		return 0;
+
+	if (readonly_request)
+		error = mnt_make_readonly(mnt);
+	else
+		__mnt_unmake_readonly(mnt);
+	return error;
+}
+
1022/* 1329/*
1023 * change filesystem flags. dir should be a physical root of filesystem. 1330 * change filesystem flags. dir should be a physical root of filesystem.
1024 * If you've mounted a non-root directory somewhere and want to do remount 1331 * If you've mounted a non-root directory somewhere and want to do remount
@@ -1041,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
 		return -EINVAL;
 
 	down_write(&sb->s_umount);
-	err = do_remount_sb(sb, flags, data, 0);
+	if (flags & MS_BIND)
+		err = change_mount_flags(nd->path.mnt, flags);
+	else
+		err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
 		nd->path.mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
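
This hunk is what makes per-mount read-only reachable from userspace: MS_REMOUNT plus MS_BIND routes the request to change_mount_flags() instead of do_remount_sb(), so only the one vfsmount is affected. A minimal sketch of the sequence (the paths are illustrative):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Create the bind mount first; it starts out r/w. */
	if (mount("/srv/data", "/mnt/ro", NULL, MS_BIND, NULL)) {
		perror("bind mount");
		return 1;
	}
	/* Flip just this mount read-only; mnt_make_readonly()
	 * returns -EBUSY if a writer holds mnt_want_write(). */
	if (mount(NULL, "/mnt/ro", NULL,
		  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL)) {
		perror("remount read-only");
		return 1;
	}
	return 0;
}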
@@ -1425,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		mnt_flags |= MNT_NODIRATIME;
 	if (flags & MS_RELATIME)
 		mnt_flags |= MNT_RELATIME;
+	if (flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);