path: root/fs/namespace.c
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--  fs/namespace.c  |  516
1 file changed, 383 insertions(+), 133 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 7953c96a2071..678f7ce060f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/acct.h> 18#include <linux/acct.h>
19#include <linux/capability.h> 19#include <linux/capability.h>
20#include <linux/cpumask.h>
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/seq_file.h> 23#include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
55 return tmp & (HASH_SIZE - 1); 56 return tmp & (HASH_SIZE - 1);
56} 57}
57 58
59#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
60
58struct vfsmount *alloc_vfsmnt(const char *name) 61struct vfsmount *alloc_vfsmnt(const char *name)
59{ 62{
60 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 63 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
68 INIT_LIST_HEAD(&mnt->mnt_share); 71 INIT_LIST_HEAD(&mnt->mnt_share);
69 INIT_LIST_HEAD(&mnt->mnt_slave_list); 72 INIT_LIST_HEAD(&mnt->mnt_slave_list);
70 INIT_LIST_HEAD(&mnt->mnt_slave); 73 INIT_LIST_HEAD(&mnt->mnt_slave);
74 atomic_set(&mnt->__mnt_writers, 0);
71 if (name) { 75 if (name) {
72 int size = strlen(name) + 1; 76 int size = strlen(name) + 1;
73 char *newname = kmalloc(size, GFP_KERNEL); 77 char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
80 return mnt; 84 return mnt;
81} 85}
82 86
87/*
88 * Most r/o checks on a fs are for operations that take
89 * discrete amounts of time, like a write() or unlink().
90 * We must keep track of when those operations start
91 * (for permission checks) and when they end, so that
92 * we can determine when writes are able to occur to
93 * a filesystem.
94 */
95/*
96 * __mnt_is_readonly: check whether a mount is read-only
97 * @mnt: the mount to check for its write status
98 *
99 * This shouldn't be used directly outside of the VFS.
100 * It does not guarantee that the filesystem will stay
101 * r/w, just that it is right *now*. This can not and
102 * should not be used in place of IS_RDONLY(inode).
103 * mnt_want/drop_write() will _keep_ the filesystem
104 * r/w.
105 */
106int __mnt_is_readonly(struct vfsmount *mnt)
107{
108 if (mnt->mnt_flags & MNT_READONLY)
109 return 1;
110 if (mnt->mnt_sb->s_flags & MS_RDONLY)
111 return 1;
112 return 0;
113}
114EXPORT_SYMBOL_GPL(__mnt_is_readonly);
115
116struct mnt_writer {
117 /*
118 * If holding multiple instances of this lock, they
119 * must be ordered by cpu number.
120 */
121 spinlock_t lock;
122 struct lock_class_key lock_class; /* compiles out with !lockdep */
123 unsigned long count;
124 struct vfsmount *mnt;
125} ____cacheline_aligned_in_smp;
126static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
127
128static int __init init_mnt_writers(void)
129{
130 int cpu;
131 for_each_possible_cpu(cpu) {
132 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
133 spin_lock_init(&writer->lock);
134 lockdep_set_class(&writer->lock, &writer->lock_class);
135 writer->count = 0;
136 }
137 return 0;
138}
139fs_initcall(init_mnt_writers);
140
141static void unlock_mnt_writers(void)
142{
143 int cpu;
144 struct mnt_writer *cpu_writer;
145
146 for_each_possible_cpu(cpu) {
147 cpu_writer = &per_cpu(mnt_writers, cpu);
148 spin_unlock(&cpu_writer->lock);
149 }
150}
151
152static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
153{
154 if (!cpu_writer->mnt)
155 return;
156 /*
157 * This is in case anyone ever leaves an invalid,
158 * old ->mnt and a count of 0.
159 */
160 if (!cpu_writer->count)
161 return;
162 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
163 cpu_writer->count = 0;
164}
165 /*
166 * must hold cpu_writer->lock
167 */
168static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
169 struct vfsmount *mnt)
170{
171 if (cpu_writer->mnt == mnt)
172 return;
173 __clear_mnt_count(cpu_writer);
174 cpu_writer->mnt = mnt;
175}
176
177/*
178 * Most r/o checks on a fs are for operations that take
179 * discrete amounts of time, like a write() or unlink().
180 * We must keep track of when those operations start
181 * (for permission checks) and when they end, so that
182 * we can determine when writes are able to occur to
183 * a filesystem.
184 */
185/**
186 * mnt_want_write - get write access to a mount
187 * @mnt: the mount on which to take a write
188 *
189 * This tells the low-level filesystem that a write is
190 * about to be performed to it, and makes sure that
191 * writes are allowed before returning success. When
192 * the write operation is finished, mnt_drop_write()
193 * must be called. This is effectively a refcount.
194 */
195int mnt_want_write(struct vfsmount *mnt)
196{
197 int ret = 0;
198 struct mnt_writer *cpu_writer;
199
200 cpu_writer = &get_cpu_var(mnt_writers);
201 spin_lock(&cpu_writer->lock);
202 if (__mnt_is_readonly(mnt)) {
203 ret = -EROFS;
204 goto out;
205 }
206 use_cpu_writer_for_mount(cpu_writer, mnt);
207 cpu_writer->count++;
208out:
209 spin_unlock(&cpu_writer->lock);
210 put_cpu_var(mnt_writers);
211 return ret;
212}
213EXPORT_SYMBOL_GPL(mnt_want_write);
214
215static void lock_mnt_writers(void)
216{
217 int cpu;
218 struct mnt_writer *cpu_writer;
219
220 for_each_possible_cpu(cpu) {
221 cpu_writer = &per_cpu(mnt_writers, cpu);
222 spin_lock(&cpu_writer->lock);
223 __clear_mnt_count(cpu_writer);
224 cpu_writer->mnt = NULL;
225 }
226}
227
228/*
229 * These per-cpu write counts are not guaranteed to have
230 * matched increments and decrements on any given cpu.
231 * A file open()ed for write on one cpu and close()d on
232 * another cpu will imbalance this count. Make sure it
233 * does not get too far out of whack.
234 */
235static void handle_write_count_underflow(struct vfsmount *mnt)
236{
237 if (atomic_read(&mnt->__mnt_writers) >=
238 MNT_WRITER_UNDERFLOW_LIMIT)
239 return;
240 /*
241 * It isn't necessary to hold all of the locks
242 * at the same time, but doing it this way makes
243 * us share a lot more code.
244 */
245 lock_mnt_writers();
246 /*
247 * vfsmount_lock is for mnt_flags.
248 */
249 spin_lock(&vfsmount_lock);
250 /*
251 * If coalescing the per-cpu writer counts did not
252 * get us back to a positive writer count, we have
253 * a bug.
254 */
255 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
256 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
257 printk(KERN_DEBUG "leak detected on mount(%p) writers "
258 "count: %d\n",
259 mnt, atomic_read(&mnt->__mnt_writers));
260 WARN_ON(1);
261 /* use the flag to keep the dmesg spam down */
262 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
263 }
264 spin_unlock(&vfsmount_lock);
265 unlock_mnt_writers();
266}
267
268/**
269 * mnt_drop_write - give up write access to a mount
270 * @mnt: the mount on which to give up write access
271 *
272 * Tells the low-level filesystem that we are done
273 * performing writes to it. Must be matched with
274 * mnt_want_write() call above.
275 */
276void mnt_drop_write(struct vfsmount *mnt)
277{
278 int must_check_underflow = 0;
279 struct mnt_writer *cpu_writer;
280
281 cpu_writer = &get_cpu_var(mnt_writers);
282 spin_lock(&cpu_writer->lock);
283
284 use_cpu_writer_for_mount(cpu_writer, mnt);
285 if (cpu_writer->count > 0) {
286 cpu_writer->count--;
287 } else {
288 must_check_underflow = 1;
289 atomic_dec(&mnt->__mnt_writers);
290 }
291
292 spin_unlock(&cpu_writer->lock);
293 /*
294 * Logically, we could call this each time,
295 * but the __mnt_writers cacheline tends to
296 * be cold, and makes this expensive.
297 */
298 if (must_check_underflow)
299 handle_write_count_underflow(mnt);
300 /*
301 * This could be done right after the spinlock
302 * is taken because the spinlock keeps us on
303 * the cpu, and disables preemption. However,
304 * putting it here bounds the amount that
305 * __mnt_writers can underflow. Without it,
306 * we could theoretically wrap __mnt_writers.
307 */
308 put_cpu_var(mnt_writers);
309}
310EXPORT_SYMBOL_GPL(mnt_drop_write);
311
312static int mnt_make_readonly(struct vfsmount *mnt)
313{
314 int ret = 0;
315
316 lock_mnt_writers();
317 /*
318 * With all the locks held, this value is stable
319 */
320 if (atomic_read(&mnt->__mnt_writers) > 0) {
321 ret = -EBUSY;
322 goto out;
323 }
324 /*
325 * nobody can do a successful mnt_want_write() with all
326 * of the counts in MNT_DENIED_WRITE and the locks held.
327 */
328 spin_lock(&vfsmount_lock);
329 if (!ret)
330 mnt->mnt_flags |= MNT_READONLY;
331 spin_unlock(&vfsmount_lock);
332out:
333 unlock_mnt_writers();
334 return ret;
335}
336
337static void __mnt_unmake_readonly(struct vfsmount *mnt)
338{
339 spin_lock(&vfsmount_lock);
340 mnt->mnt_flags &= ~MNT_READONLY;
341 spin_unlock(&vfsmount_lock);
342}
343
83int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 344int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
84{ 345{
85 mnt->mnt_sb = sb; 346 mnt->mnt_sb = sb;
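
[Illustration, not part of the patch] The kernel-doc above describes mnt_want_write()/mnt_drop_write() as a refcount-like pair that brackets a write operation. A minimal caller sketch of that pattern follows; example_write_op() and vfs_do_write_op() are made-up names used only for illustration:

/*
 * Illustrative only -- not from this diff.  Hypothetical write path
 * showing the expected pairing of the helpers added above.
 */
#include <linux/mount.h>
#include <linux/dcache.h>

/* made-up stand-in for the actual write-side work */
static int vfs_do_write_op(struct dentry *dentry)
{
	return 0;
}

static int example_write_op(struct vfsmount *mnt, struct dentry *dentry)
{
	int err;

	err = mnt_want_write(mnt);	/* fails with -EROFS on a read-only mount */
	if (err)
		return err;
	err = vfs_do_write_op(dentry);
	mnt_drop_write(mnt);		/* every successful want must be dropped */
	return err;
}
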
@@ -155,15 +416,15 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
155 } 416 }
156} 417}
157 418
158static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) 419static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
159{ 420{
160 old_nd->path.dentry = mnt->mnt_mountpoint; 421 old_path->dentry = mnt->mnt_mountpoint;
161 old_nd->path.mnt = mnt->mnt_parent; 422 old_path->mnt = mnt->mnt_parent;
162 mnt->mnt_parent = mnt; 423 mnt->mnt_parent = mnt;
163 mnt->mnt_mountpoint = mnt->mnt_root; 424 mnt->mnt_mountpoint = mnt->mnt_root;
164 list_del_init(&mnt->mnt_child); 425 list_del_init(&mnt->mnt_child);
165 list_del_init(&mnt->mnt_hash); 426 list_del_init(&mnt->mnt_hash);
166 old_nd->path.dentry->d_mounted--; 427 old_path->dentry->d_mounted--;
167} 428}
168 429
169void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 430void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
@@ -174,12 +435,12 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
174 dentry->d_mounted++; 435 dentry->d_mounted++;
175} 436}
176 437
177static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd) 438static void attach_mnt(struct vfsmount *mnt, struct path *path)
178{ 439{
179 mnt_set_mountpoint(nd->path.mnt, nd->path.dentry, mnt); 440 mnt_set_mountpoint(path->mnt, path->dentry, mnt);
180 list_add_tail(&mnt->mnt_hash, mount_hashtable + 441 list_add_tail(&mnt->mnt_hash, mount_hashtable +
181 hash(nd->path.mnt, nd->path.dentry)); 442 hash(path->mnt, path->dentry));
182 list_add_tail(&mnt->mnt_child, &nd->path.mnt->mnt_mounts); 443 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
183} 444}
184 445
185/* 446/*
@@ -262,10 +523,8 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
262 /* stick the duplicate mount on the same expiry list 523 /* stick the duplicate mount on the same expiry list
263 * as the original if that was on one */ 524 * as the original if that was on one */
264 if (flag & CL_EXPIRE) { 525 if (flag & CL_EXPIRE) {
265 spin_lock(&vfsmount_lock);
266 if (!list_empty(&old->mnt_expire)) 526 if (!list_empty(&old->mnt_expire))
267 list_add(&mnt->mnt_expire, &old->mnt_expire); 527 list_add(&mnt->mnt_expire, &old->mnt_expire);
268 spin_unlock(&vfsmount_lock);
269 } 528 }
270 } 529 }
271 return mnt; 530 return mnt;
@@ -273,7 +532,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
273 532
274static inline void __mntput(struct vfsmount *mnt) 533static inline void __mntput(struct vfsmount *mnt)
275{ 534{
535 int cpu;
276 struct super_block *sb = mnt->mnt_sb; 536 struct super_block *sb = mnt->mnt_sb;
537 /*
538 * We don't have to hold all of the locks at the
539 * same time here because we know that we're the
540 * last reference to mnt and that no new writers
541 * can come in.
542 */
543 for_each_possible_cpu(cpu) {
544 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
545 if (cpu_writer->mnt != mnt)
546 continue;
547 spin_lock(&cpu_writer->lock);
548 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
549 cpu_writer->count = 0;
550 /*
551 * Might as well do this so that no one
552 * ever sees the pointer and expects
553 * it to be valid.
554 */
555 cpu_writer->mnt = NULL;
556 spin_unlock(&cpu_writer->lock);
557 }
558 /*
559 * This probably indicates that somebody messed
560 * up a mnt_want/drop_write() pair. If this
561 * happens, the filesystem was probably unable
562 * to make r/w->r/o transitions.
563 */
564 WARN_ON(atomic_read(&mnt->__mnt_writers));
277 dput(mnt->mnt_root); 565 dput(mnt->mnt_root);
278 free_vfsmnt(mnt); 566 free_vfsmnt(mnt);
279 deactivate_super(sb); 567 deactivate_super(sb);
@@ -419,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
419 seq_putc(m, '.'); 707 seq_putc(m, '.');
420 mangle(m, mnt->mnt_sb->s_subtype); 708 mangle(m, mnt->mnt_sb->s_subtype);
421 } 709 }
422 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); 710 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
423 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { 711 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
424 if (mnt->mnt_sb->s_flags & fs_infop->flag) 712 if (mnt->mnt_sb->s_flags & fs_infop->flag)
425 seq_puts(m, fs_infop->str); 713 seq_puts(m, fs_infop->str);
@@ -548,6 +836,7 @@ void release_mounts(struct list_head *head)
548 m = mnt->mnt_parent; 836 m = mnt->mnt_parent;
549 mnt->mnt_mountpoint = mnt->mnt_root; 837 mnt->mnt_mountpoint = mnt->mnt_root;
550 mnt->mnt_parent = mnt; 838 mnt->mnt_parent = mnt;
839 m->mnt_ghosts--;
551 spin_unlock(&vfsmount_lock); 840 spin_unlock(&vfsmount_lock);
552 dput(dentry); 841 dput(dentry);
553 mntput(m); 842 mntput(m);
@@ -572,12 +861,16 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
572 __touch_mnt_namespace(p->mnt_ns); 861 __touch_mnt_namespace(p->mnt_ns);
573 p->mnt_ns = NULL; 862 p->mnt_ns = NULL;
574 list_del_init(&p->mnt_child); 863 list_del_init(&p->mnt_child);
575 if (p->mnt_parent != p) 864 if (p->mnt_parent != p) {
865 p->mnt_parent->mnt_ghosts++;
576 p->mnt_mountpoint->d_mounted--; 866 p->mnt_mountpoint->d_mounted--;
867 }
577 change_mnt_propagation(p, MS_PRIVATE); 868 change_mnt_propagation(p, MS_PRIVATE);
578 } 869 }
579} 870}
580 871
872static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
873
581static int do_umount(struct vfsmount *mnt, int flags) 874static int do_umount(struct vfsmount *mnt, int flags)
582{ 875{
583 struct super_block *sb = mnt->mnt_sb; 876 struct super_block *sb = mnt->mnt_sb;
@@ -650,6 +943,9 @@ static int do_umount(struct vfsmount *mnt, int flags)
650 spin_lock(&vfsmount_lock); 943 spin_lock(&vfsmount_lock);
651 event++; 944 event++;
652 945
946 if (!(flags & MNT_DETACH))
947 shrink_submounts(mnt, &umount_list);
948
653 retval = -EBUSY; 949 retval = -EBUSY;
654 if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { 950 if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
655 if (!list_empty(&mnt->mnt_list)) 951 if (!list_empty(&mnt->mnt_list))
@@ -744,7 +1040,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
744 int flag) 1040 int flag)
745{ 1041{
746 struct vfsmount *res, *p, *q, *r, *s; 1042 struct vfsmount *res, *p, *q, *r, *s;
747 struct nameidata nd; 1043 struct path path;
748 1044
749 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) 1045 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
750 return NULL; 1046 return NULL;
@@ -769,14 +1065,14 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
769 q = q->mnt_parent; 1065 q = q->mnt_parent;
770 } 1066 }
771 p = s; 1067 p = s;
772 nd.path.mnt = q; 1068 path.mnt = q;
773 nd.path.dentry = p->mnt_mountpoint; 1069 path.dentry = p->mnt_mountpoint;
774 q = clone_mnt(p, p->mnt_root, flag); 1070 q = clone_mnt(p, p->mnt_root, flag);
775 if (!q) 1071 if (!q)
776 goto Enomem; 1072 goto Enomem;
777 spin_lock(&vfsmount_lock); 1073 spin_lock(&vfsmount_lock);
778 list_add_tail(&q->mnt_list, &res->mnt_list); 1074 list_add_tail(&q->mnt_list, &res->mnt_list);
779 attach_mnt(q, &nd); 1075 attach_mnt(q, &path);
780 spin_unlock(&vfsmount_lock); 1076 spin_unlock(&vfsmount_lock);
781 } 1077 }
782 } 1078 }
@@ -876,11 +1172,11 @@ void drop_collected_mounts(struct vfsmount *mnt)
876 * in allocations. 1172 * in allocations.
877 */ 1173 */
878static int attach_recursive_mnt(struct vfsmount *source_mnt, 1174static int attach_recursive_mnt(struct vfsmount *source_mnt,
879 struct nameidata *nd, struct nameidata *parent_nd) 1175 struct path *path, struct path *parent_path)
880{ 1176{
881 LIST_HEAD(tree_list); 1177 LIST_HEAD(tree_list);
882 struct vfsmount *dest_mnt = nd->path.mnt; 1178 struct vfsmount *dest_mnt = path->mnt;
883 struct dentry *dest_dentry = nd->path.dentry; 1179 struct dentry *dest_dentry = path->dentry;
884 struct vfsmount *child, *p; 1180 struct vfsmount *child, *p;
885 1181
886 if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) 1182 if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
@@ -892,9 +1188,9 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
892 } 1188 }
893 1189
894 spin_lock(&vfsmount_lock); 1190 spin_lock(&vfsmount_lock);
895 if (parent_nd) { 1191 if (parent_path) {
896 detach_mnt(source_mnt, parent_nd); 1192 detach_mnt(source_mnt, parent_path);
897 attach_mnt(source_mnt, nd); 1193 attach_mnt(source_mnt, path);
898 touch_mnt_namespace(current->nsproxy->mnt_ns); 1194 touch_mnt_namespace(current->nsproxy->mnt_ns);
899 } else { 1195 } else {
900 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); 1196 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
@@ -930,7 +1226,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
930 1226
931 err = -ENOENT; 1227 err = -ENOENT;
932 if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry)) 1228 if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry))
933 err = attach_recursive_mnt(mnt, nd, NULL); 1229 err = attach_recursive_mnt(mnt, &nd->path, NULL);
934out_unlock: 1230out_unlock:
935 mutex_unlock(&nd->path.dentry->d_inode->i_mutex); 1231 mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
936 if (!err) 1232 if (!err)
@@ -1013,6 +1309,23 @@ out:
1013 return err; 1309 return err;
1014} 1310}
1015 1311
1312static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
1313{
1314 int error = 0;
1315 int readonly_request = 0;
1316
1317 if (ms_flags & MS_RDONLY)
1318 readonly_request = 1;
1319 if (readonly_request == __mnt_is_readonly(mnt))
1320 return 0;
1321
1322 if (readonly_request)
1323 error = mnt_make_readonly(mnt);
1324 else
1325 __mnt_unmake_readonly(mnt);
1326 return error;
1327}
1328
1016/* 1329/*
1017 * change filesystem flags. dir should be a physical root of filesystem. 1330 * change filesystem flags. dir should be a physical root of filesystem.
1018 * If you've mounted a non-root directory somewhere and want to do remount 1331 * If you've mounted a non-root directory somewhere and want to do remount
@@ -1035,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
1035 return -EINVAL; 1348 return -EINVAL;
1036 1349
1037 down_write(&sb->s_umount); 1350 down_write(&sb->s_umount);
1038 err = do_remount_sb(sb, flags, data, 0); 1351 if (flags & MS_BIND)
1352 err = change_mount_flags(nd->path.mnt, flags);
1353 else
1354 err = do_remount_sb(sb, flags, data, 0);
1039 if (!err) 1355 if (!err)
1040 nd->path.mnt->mnt_flags = mnt_flags; 1356 nd->path.mnt->mnt_flags = mnt_flags;
1041 up_write(&sb->s_umount); 1357 up_write(&sb->s_umount);
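
[Illustration, not part of the patch] With do_remount() now routing MS_BIND requests through change_mount_flags(), a remount request that carries MS_BIND toggles only the per-mount MNT_READONLY flag, leaving the superblock writable. A minimal userspace sketch, assuming /mnt/ro is an existing bind mount:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* make the bind mount itself read-only; the underlying
	 * superblock (and other mounts of it) stays read-write */
	if (mount("none", "/mnt/ro", NULL,
		  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) == -1) {
		perror("mount");
		return 1;
	}
	return 0;
}
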
@@ -1059,7 +1375,8 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
1059 */ 1375 */
1060static noinline int do_move_mount(struct nameidata *nd, char *old_name) 1376static noinline int do_move_mount(struct nameidata *nd, char *old_name)
1061{ 1377{
1062 struct nameidata old_nd, parent_nd; 1378 struct nameidata old_nd;
1379 struct path parent_path;
1063 struct vfsmount *p; 1380 struct vfsmount *p;
1064 int err = 0; 1381 int err = 0;
1065 if (!capable(CAP_SYS_ADMIN)) 1382 if (!capable(CAP_SYS_ADMIN))
@@ -1114,21 +1431,19 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name)
1114 if (p == old_nd.path.mnt) 1431 if (p == old_nd.path.mnt)
1115 goto out1; 1432 goto out1;
1116 1433
1117 err = attach_recursive_mnt(old_nd.path.mnt, nd, &parent_nd); 1434 err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path);
1118 if (err) 1435 if (err)
1119 goto out1; 1436 goto out1;
1120 1437
1121 spin_lock(&vfsmount_lock);
1122 /* if the mount is moved, it should no longer be expire 1438 /* if the mount is moved, it should no longer be expire
1123 * automatically */ 1439 * automatically */
1124 list_del_init(&old_nd.path.mnt->mnt_expire); 1440 list_del_init(&old_nd.path.mnt->mnt_expire);
1125 spin_unlock(&vfsmount_lock);
1126out1: 1441out1:
1127 mutex_unlock(&nd->path.dentry->d_inode->i_mutex); 1442 mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
1128out: 1443out:
1129 up_write(&namespace_sem); 1444 up_write(&namespace_sem);
1130 if (!err) 1445 if (!err)
1131 path_put(&parent_nd.path); 1446 path_put(&parent_path);
1132 path_put(&old_nd.path); 1447 path_put(&old_nd.path);
1133 return err; 1448 return err;
1134} 1449}
@@ -1189,12 +1504,9 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
1189 if ((err = graft_tree(newmnt, nd))) 1504 if ((err = graft_tree(newmnt, nd)))
1190 goto unlock; 1505 goto unlock;
1191 1506
1192 if (fslist) { 1507 if (fslist) /* add to the specified expiration list */
1193 /* add to the specified expiration list */
1194 spin_lock(&vfsmount_lock);
1195 list_add_tail(&newmnt->mnt_expire, fslist); 1508 list_add_tail(&newmnt->mnt_expire, fslist);
1196 spin_unlock(&vfsmount_lock); 1509
1197 }
1198 up_write(&namespace_sem); 1510 up_write(&namespace_sem);
1199 return 0; 1511 return 0;
1200 1512
@@ -1206,75 +1518,6 @@ unlock:
1206 1518
1207EXPORT_SYMBOL_GPL(do_add_mount); 1519EXPORT_SYMBOL_GPL(do_add_mount);
1208 1520
1209static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
1210 struct list_head *umounts)
1211{
1212 spin_lock(&vfsmount_lock);
1213
1214 /*
1215 * Check if mount is still attached, if not, let whoever holds it deal
1216 * with the sucker
1217 */
1218 if (mnt->mnt_parent == mnt) {
1219 spin_unlock(&vfsmount_lock);
1220 return;
1221 }
1222
1223 /*
1224 * Check that it is still dead: the count should now be 2 - as
1225 * contributed by the vfsmount parent and the mntget above
1226 */
1227 if (!propagate_mount_busy(mnt, 2)) {
1228 /* delete from the namespace */
1229 touch_mnt_namespace(mnt->mnt_ns);
1230 list_del_init(&mnt->mnt_list);
1231 mnt->mnt_ns = NULL;
1232 umount_tree(mnt, 1, umounts);
1233 spin_unlock(&vfsmount_lock);
1234 } else {
1235 /*
1236 * Someone brought it back to life whilst we didn't have any
1237 * locks held so return it to the expiration list
1238 */
1239 list_add_tail(&mnt->mnt_expire, mounts);
1240 spin_unlock(&vfsmount_lock);
1241 }
1242}
1243
1244/*
1245 * go through the vfsmounts we've just consigned to the graveyard to
1246 * - check that they're still dead
1247 * - delete the vfsmount from the appropriate namespace under lock
1248 * - dispose of the corpse
1249 */
1250static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
1251{
1252 struct mnt_namespace *ns;
1253 struct vfsmount *mnt;
1254
1255 while (!list_empty(graveyard)) {
1256 LIST_HEAD(umounts);
1257 mnt = list_first_entry(graveyard, struct vfsmount, mnt_expire);
1258 list_del_init(&mnt->mnt_expire);
1259
1260 /* don't do anything if the namespace is dead - all the
1261 * vfsmounts from it are going away anyway */
1262 ns = mnt->mnt_ns;
1263 if (!ns || !ns->root)
1264 continue;
1265 get_mnt_ns(ns);
1266
1267 spin_unlock(&vfsmount_lock);
1268 down_write(&namespace_sem);
1269 expire_mount(mnt, mounts, &umounts);
1270 up_write(&namespace_sem);
1271 release_mounts(&umounts);
1272 mntput(mnt);
1273 put_mnt_ns(ns);
1274 spin_lock(&vfsmount_lock);
1275 }
1276}
1277
1278/* 1521/*
1279 * process a list of expirable mountpoints with the intent of discarding any 1522 * process a list of expirable mountpoints with the intent of discarding any
1280 * mountpoints that aren't in use and haven't been touched since last we came 1523 * mountpoints that aren't in use and haven't been touched since last we came
@@ -1284,10 +1527,12 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1284{ 1527{
1285 struct vfsmount *mnt, *next; 1528 struct vfsmount *mnt, *next;
1286 LIST_HEAD(graveyard); 1529 LIST_HEAD(graveyard);
1530 LIST_HEAD(umounts);
1287 1531
1288 if (list_empty(mounts)) 1532 if (list_empty(mounts))
1289 return; 1533 return;
1290 1534
1535 down_write(&namespace_sem);
1291 spin_lock(&vfsmount_lock); 1536 spin_lock(&vfsmount_lock);
1292 1537
1293 /* extract from the expiration list every vfsmount that matches the 1538 /* extract from the expiration list every vfsmount that matches the
@@ -1298,16 +1543,19 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1298 */ 1543 */
1299 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 1544 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
1300 if (!xchg(&mnt->mnt_expiry_mark, 1) || 1545 if (!xchg(&mnt->mnt_expiry_mark, 1) ||
1301 atomic_read(&mnt->mnt_count) != 1) 1546 propagate_mount_busy(mnt, 1))
1302 continue; 1547 continue;
1303
1304 mntget(mnt);
1305 list_move(&mnt->mnt_expire, &graveyard); 1548 list_move(&mnt->mnt_expire, &graveyard);
1306 } 1549 }
1307 1550 while (!list_empty(&graveyard)) {
1308 expire_mount_list(&graveyard, mounts); 1551 mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire);
1309 1552 touch_mnt_namespace(mnt->mnt_ns);
1553 umount_tree(mnt, 1, &umounts);
1554 }
1310 spin_unlock(&vfsmount_lock); 1555 spin_unlock(&vfsmount_lock);
1556 up_write(&namespace_sem);
1557
1558 release_mounts(&umounts);
1311} 1559}
1312 1560
1313EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 1561EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
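
[Illustration, not part of the patch] The expiry rework above serves filesystems that create automounted submounts: they keep those vfsmounts on a private list and age it from a periodic worker, and a mount is only culled on the second pass in which it was still unused (the mnt_expiry_mark handshake). A rough sketch of that usage, with all example_* names invented:

#include <linux/list.h>
#include <linux/mount.h>
#include <linux/namei.h>

static LIST_HEAD(example_auto_mounts);	/* hypothetical per-fs expiry list */

/* attach a freshly created submount and queue it on the expiry list */
static int example_add_submount(struct vfsmount *newmnt, struct nameidata *nd)
{
	return do_add_mount(newmnt, nd, MNT_SHRINKABLE, &example_auto_mounts);
}

/* called from a periodic worker; unused mounts expire on the second pass */
static void example_expire_mounts(void)
{
	mark_mounts_for_expiry(&example_auto_mounts);
}
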
@@ -1343,7 +1591,6 @@ resume:
1343 } 1591 }
1344 1592
1345 if (!propagate_mount_busy(mnt, 1)) { 1593 if (!propagate_mount_busy(mnt, 1)) {
1346 mntget(mnt);
1347 list_move_tail(&mnt->mnt_expire, graveyard); 1594 list_move_tail(&mnt->mnt_expire, graveyard);
1348 found++; 1595 found++;
1349 } 1596 }
@@ -1363,22 +1610,22 @@ resume:
1363 * process a list of expirable mountpoints with the intent of discarding any 1610 * process a list of expirable mountpoints with the intent of discarding any
1364 * submounts of a specific parent mountpoint 1611 * submounts of a specific parent mountpoint
1365 */ 1612 */
1366void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts) 1613static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
1367{ 1614{
1368 LIST_HEAD(graveyard); 1615 LIST_HEAD(graveyard);
1369 int found; 1616 struct vfsmount *m;
1370
1371 spin_lock(&vfsmount_lock);
1372 1617
1373 /* extract submounts of 'mountpoint' from the expiration list */ 1618 /* extract submounts of 'mountpoint' from the expiration list */
1374 while ((found = select_submounts(mountpoint, &graveyard)) != 0) 1619 while (select_submounts(mnt, &graveyard)) {
1375 expire_mount_list(&graveyard, mounts); 1620 while (!list_empty(&graveyard)) {
1376 1621 m = list_first_entry(&graveyard, struct vfsmount,
1377 spin_unlock(&vfsmount_lock); 1622 mnt_expire);
1623 touch_mnt_namespace(mnt->mnt_ns);
1624 umount_tree(mnt, 1, umounts);
1625 }
1626 }
1378} 1627}
1379 1628
1380EXPORT_SYMBOL_GPL(shrink_submounts);
1381
1382/* 1629/*
1383 * Some copy_from_user() implementations do not return the exact number of 1630 * Some copy_from_user() implementations do not return the exact number of
1384 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 1631 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
@@ -1488,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1488 mnt_flags |= MNT_NODIRATIME; 1735 mnt_flags |= MNT_NODIRATIME;
1489 if (flags & MS_RELATIME) 1736 if (flags & MS_RELATIME)
1490 mnt_flags |= MNT_RELATIME; 1737 mnt_flags |= MNT_RELATIME;
1738 if (flags & MS_RDONLY)
1739 mnt_flags |= MNT_READONLY;
1491 1740
1492 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 1741 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
1493 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); 1742 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
@@ -1683,7 +1932,7 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
1683 path_put(&old_pwd); 1932 path_put(&old_pwd);
1684} 1933}
1685 1934
1686static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) 1935static void chroot_fs_refs(struct path *old_root, struct path *new_root)
1687{ 1936{
1688 struct task_struct *g, *p; 1937 struct task_struct *g, *p;
1689 struct fs_struct *fs; 1938 struct fs_struct *fs;
@@ -1695,12 +1944,12 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
1695 if (fs) { 1944 if (fs) {
1696 atomic_inc(&fs->count); 1945 atomic_inc(&fs->count);
1697 task_unlock(p); 1946 task_unlock(p);
1698 if (fs->root.dentry == old_nd->path.dentry 1947 if (fs->root.dentry == old_root->dentry
1699 && fs->root.mnt == old_nd->path.mnt) 1948 && fs->root.mnt == old_root->mnt)
1700 set_fs_root(fs, &new_nd->path); 1949 set_fs_root(fs, new_root);
1701 if (fs->pwd.dentry == old_nd->path.dentry 1950 if (fs->pwd.dentry == old_root->dentry
1702 && fs->pwd.mnt == old_nd->path.mnt) 1951 && fs->pwd.mnt == old_root->mnt)
1703 set_fs_pwd(fs, &new_nd->path); 1952 set_fs_pwd(fs, new_root);
1704 put_fs_struct(fs); 1953 put_fs_struct(fs);
1705 } else 1954 } else
1706 task_unlock(p); 1955 task_unlock(p);
@@ -1737,7 +1986,8 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1737 const char __user * put_old) 1986 const char __user * put_old)
1738{ 1987{
1739 struct vfsmount *tmp; 1988 struct vfsmount *tmp;
1740 struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; 1989 struct nameidata new_nd, old_nd, user_nd;
1990 struct path parent_path, root_parent;
1741 int error; 1991 int error;
1742 1992
1743 if (!capable(CAP_SYS_ADMIN)) 1993 if (!capable(CAP_SYS_ADMIN))
@@ -1811,19 +2061,19 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1811 goto out3; 2061 goto out3;
1812 } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) 2062 } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
1813 goto out3; 2063 goto out3;
1814 detach_mnt(new_nd.path.mnt, &parent_nd); 2064 detach_mnt(new_nd.path.mnt, &parent_path);
1815 detach_mnt(user_nd.path.mnt, &root_parent); 2065 detach_mnt(user_nd.path.mnt, &root_parent);
1816 /* mount old root on put_old */ 2066 /* mount old root on put_old */
1817 attach_mnt(user_nd.path.mnt, &old_nd); 2067 attach_mnt(user_nd.path.mnt, &old_nd.path);
1818 /* mount new_root on / */ 2068 /* mount new_root on / */
1819 attach_mnt(new_nd.path.mnt, &root_parent); 2069 attach_mnt(new_nd.path.mnt, &root_parent);
1820 touch_mnt_namespace(current->nsproxy->mnt_ns); 2070 touch_mnt_namespace(current->nsproxy->mnt_ns);
1821 spin_unlock(&vfsmount_lock); 2071 spin_unlock(&vfsmount_lock);
1822 chroot_fs_refs(&user_nd, &new_nd); 2072 chroot_fs_refs(&user_nd.path, &new_nd.path);
1823 security_sb_post_pivotroot(&user_nd, &new_nd); 2073 security_sb_post_pivotroot(&user_nd, &new_nd);
1824 error = 0; 2074 error = 0;
1825 path_put(&root_parent.path); 2075 path_put(&root_parent);
1826 path_put(&parent_nd.path); 2076 path_put(&parent_path);
1827out2: 2077out2:
1828 mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); 2078 mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
1829 up_write(&namespace_sem); 2079 up_write(&namespace_sem);