Diffstat (limited to 'fs/namespace.c')
-rw-r--r--	fs/namespace.c	419
1 file changed, 208 insertions(+), 211 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/mnt_namespace.h>
 #include <linux/namei.h>
+#include <linux/nsproxy.h>
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/ramfs.h>
@@ -42,6 +43,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static int mnt_id_start = 0;
+static int mnt_group_start = 1;
 
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -69,7 +72,9 @@ static int mnt_alloc_id(struct vfsmount *mnt)
 retry:
 	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
 	spin_lock(&vfsmount_lock);
-	res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
+	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
+	if (!res)
+		mnt_id_start = mnt->mnt_id + 1;
 	spin_unlock(&vfsmount_lock);
 	if (res == -EAGAIN)
 		goto retry;
@@ -79,8 +84,11 @@ retry:
 
 static void mnt_free_id(struct vfsmount *mnt)
 {
+	int id = mnt->mnt_id;
 	spin_lock(&vfsmount_lock);
-	ida_remove(&mnt_id_ida, mnt->mnt_id);
+	ida_remove(&mnt_id_ida, id);
+	if (mnt_id_start > id)
+		mnt_id_start = id;
 	spin_unlock(&vfsmount_lock);
 }
 
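The new mnt_id_start / mnt_group_start variables above are allocation hints: mnt_alloc_id() asks the IDA for the lowest free ID at or above the hint and then bumps the hint past it, while mnt_free_id() lowers the hint again so a freed slot is found cheaply on the next allocation. A minimal user-space sketch of the same hint pattern, using a plain bitmap instead of the kernel's IDA (the names and sizes below are illustrative, not from this patch):

#include <stdio.h>

#define MAX_IDS 64

static unsigned char used[MAX_IDS];
static int id_start;			/* lowest ID that might still be free */

/* Allocate the lowest free ID >= id_start, mirroring ida_get_new_above(). */
static int alloc_id(void)
{
	for (int id = id_start; id < MAX_IDS; id++) {
		if (!used[id]) {
			used[id] = 1;
			id_start = id + 1;	/* skip known-used IDs next time */
			return id;
		}
	}
	return -1;			/* out of IDs (-ENOSPC in the kernel) */
}

/* Free an ID and lower the hint so the slot is reused cheaply. */
static void free_id(int id)
{
	used[id] = 0;
	if (id_start > id)
		id_start = id;
}

int main(void)
{
	int a = alloc_id(), b = alloc_id(), c = alloc_id();	/* 0, 1, 2 */
	free_id(b);						/* hint drops back to 1 */
	printf("%d %d %d -> reuse %d\n", a, b, c, alloc_id());	/* reuses 1 */
	return 0;
}

The effect is that mount IDs stay small and dense without rescanning the whole ID space from zero on every allocation.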
@@ -91,10 +99,18 @@ static void mnt_free_id(struct vfsmount *mnt)
  */
 static int mnt_alloc_group_id(struct vfsmount *mnt)
 {
+	int res;
+
 	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
 		return -ENOMEM;
 
-	return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
+	res = ida_get_new_above(&mnt_group_ida,
+				mnt_group_start,
+				&mnt->mnt_group_id);
+	if (!res)
+		mnt_group_start = mnt->mnt_group_id + 1;
+
+	return res;
 }
 
 /*
@@ -102,7 +118,10 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)
  */
 void mnt_release_group_id(struct vfsmount *mnt)
 {
-	ida_remove(&mnt_group_ida, mnt->mnt_group_id);
+	int id = mnt->mnt_group_id;
+	ida_remove(&mnt_group_ida, id);
+	if (mnt_group_start > id)
+		mnt_group_start = id;
 	mnt->mnt_group_id = 0;
 }
 
@@ -131,10 +150,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +200,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-			  struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
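The three helpers above replace the global mnt_writers machinery with one per-CPU integer per vfsmount: inc_mnt_writers() and dec_mnt_writers() touch only the local CPU's slot, and only count_mnt_writers() walks all CPUs. An individual slot can legitimately go negative, because a write reference may be taken on one CPU and dropped on another; only the sum across CPUs is meaningful. A hedged user-space illustration of that bookkeeping (a plain array stands in for the alloc_percpu() storage; names are illustrative):

#include <stdio.h>

#define NR_CPUS 4

/* Stand-in for the per-CPU mnt_writers slots of one vfsmount. */
static int mnt_writers[NR_CPUS];

static void inc_writers(int cpu) { mnt_writers[cpu]++; }
static void dec_writers(int cpu) { mnt_writers[cpu]--; }

/* Sum all slots, as count_mnt_writers() does across possible CPUs. */
static int count_writers(void)
{
	int sum = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += mnt_writers[cpu];
	return sum;
}

int main(void)
{
	inc_writers(0);			/* write reference taken on CPU 0 */
	dec_writers(2);			/* matching drop happens on CPU 2 */
	/* Slot 0 is +1, slot 2 is -1, but the total is still balanced. */
	printf("cpu0=%d cpu2=%d total=%d\n",
	       mnt_writers[0], mnt_writers[2], count_writers());
	return 0;
}

This is why the read-only transition further down has to stop the counters from moving (MNT_WRITE_HOLD) while it sums them.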
@@ -253,74 +255,74 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
+/**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+	/* superblock may be r/o */
+	if (__mnt_is_readonly(mnt))
+		return -EROFS;
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	preempt_enable();
+	return 0;
 }
+EXPORT_SYMBOL_GPL(mnt_clone_write);
 
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
  */
-static void handle_write_count_underflow(struct vfsmount *mnt)
+int mnt_want_write_file(struct file *file)
 {
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-				mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
+	struct inode *inode = file->f_dentry->d_inode;
+	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+		return mnt_want_write(file->f_path.mnt);
+	else
+		return mnt_clone_write(file->f_path.mnt);
 }
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
  * mnt_drop_write - give up write access to a mount
@@ -332,37 +334,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption. However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow. Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +344,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
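Taken together, the new mnt_want_write() and mnt_make_readonly() form an ordering handshake rather than a lock: the writer publishes its per-CPU increment (smp_mb) before sampling MNT_WRITE_HOLD, and the read-only path publishes MNT_WRITE_HOLD before summing the counters, so at least one side is guaranteed to see the other. A minimal user-space model of that handshake, using C11 seq_cst atomics, threads, and a single shared counter in place of the kernel's per-CPU slots, barrier primitives, and vfsmount_lock (all names below are illustrative, not the kernel API):

/* Build with: cc -std=c11 -pthread model.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int writers;		/* stands in for the summed per-CPU counts */
static atomic_bool write_hold;		/* stands in for MNT_WRITE_HOLD */
static atomic_bool readonly;		/* stands in for MNT_READONLY */

/* Fast path: analogue of mnt_want_write(). */
static int want_write(void)
{
	atomic_fetch_add(&writers, 1);		/* publish our intent first... */
	while (atomic_load(&write_hold))	/* ...then spin while held */
		;				/* cpu_relax() in the kernel */
	if (atomic_load(&readonly)) {		/* read only after hold clears */
		atomic_fetch_sub(&writers, 1);
		return -1;			/* -EROFS */
	}
	return 0;
}

/* Analogue of mnt_drop_write(). */
static void drop_write(void)
{
	atomic_fetch_sub(&writers, 1);
}

/* Slow path: analogue of mnt_make_readonly(). */
static int make_readonly(void)
{
	int ret = 0;

	atomic_store(&write_hold, true);	/* hold new writers... */
	if (atomic_load(&writers) > 0)		/* ...then count existing ones */
		ret = -1;			/* -EBUSY */
	else
		atomic_store(&readonly, true);
	atomic_store(&write_hold, false);	/* release any held writers */
	return ret;
}

static void *writer_thread(void *arg)
{
	(void)arg;
	if (want_write() == 0) {
		/* would modify the filesystem here */
		drop_write();
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer_thread, NULL);
	printf("make_readonly: %s\n", make_readonly() ? "-EBUSY" : "ok");
	pthread_join(&t, NULL);
	printf("readonly=%d writers=%d\n",
	       atomic_load(&readonly), atomic_load(&writers));
	return 0;
}

In the model, as in the patch, make_readonly() never waits for writers to drain: it either sees a zero count and flips the read-only flag, or reports busy, while held writers proceed (or back off with -EROFS) once the hold flag is cleared.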
@@ -410,6 +401,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -442,11 +436,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
  * lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
 	spin_lock(&vfsmount_lock);
-	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
 	spin_unlock(&vfsmount_lock);
 	return child_mnt;
@@ -604,38 +598,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair. If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe. AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
@@ -1106,11 +1080,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 * we just try to remount it readonly.
 		 */
 		down_write(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			lock_kernel();
+		if (!(sb->s_flags & MS_RDONLY))
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
-			unlock_kernel();
-		}
 		up_write(&sb->s_umount);
 		return retval;
 	}
@@ -1253,11 +1224,11 @@ Enomem:
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *collect_mounts(struct path *path)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
@@ -1430,7 +1401,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1601,7 +1572,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	down_write(&namespace_sem);
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1612,7 +1583,7 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
+	if (d_unlinked(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
@@ -1676,7 +1647,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
+	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1695,10 +1668,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
-	if (!check_mnt(path->mnt))
+	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
@@ -1984,6 +1957,21 @@ dput_out:
 	return retval;
 }
 
+static struct mnt_namespace *alloc_mnt_ns(void)
+{
+	struct mnt_namespace *new_ns;
+
+	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+	if (!new_ns)
+		return ERR_PTR(-ENOMEM);
+	atomic_set(&new_ns->count, 1);
+	new_ns->root = NULL;
+	INIT_LIST_HEAD(&new_ns->list);
+	init_waitqueue_head(&new_ns->poll);
+	new_ns->event = 0;
+	return new_ns;
+}
+
 /*
  * Allocate a new namespace structure and populate it with contents
  * copied from the namespace of the passed in task structure.
@@ -1995,14 +1983,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 	struct vfsmount *p, *q;
 
-	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-	if (!new_ns)
-		return ERR_PTR(-ENOMEM);
-
-	atomic_set(&new_ns->count, 1);
-	INIT_LIST_HEAD(&new_ns->list);
-	init_waitqueue_head(&new_ns->poll);
-	new_ns->event = 0;
+	new_ns = alloc_mnt_ns();
+	if (IS_ERR(new_ns))
+		return new_ns;
 
 	down_write(&namespace_sem);
 	/* First pass: copy the tree topology */
@@ -2066,6 +2049,24 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	return new_ns;
 }
 
+/**
+ * create_mnt_ns - creates a private namespace and adds a root filesystem
+ * @mnt: pointer to the new root filesystem mountpoint
+ */
+struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+{
+	struct mnt_namespace *new_ns;
+
+	new_ns = alloc_mnt_ns();
+	if (!IS_ERR(new_ns)) {
+		mnt->mnt_ns = new_ns;
+		new_ns->root = mnt;
+		list_add(&new_ns->list, &new_ns->root->mnt_list);
+	}
+	return new_ns;
+}
+EXPORT_SYMBOL(create_mnt_ns);
+
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 		char __user *, type, unsigned long, flags, void __user *, data)
 {
@@ -2092,10 +2093,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (retval < 0)
 		goto out3;
 
-	lock_kernel();
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
-	unlock_kernel();
 	free_page(data_page);
 
 out3:
@@ -2175,9 +2174,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
+	if (d_unlinked(new.dentry))
 		goto out2;
-	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
+	if (d_unlinked(old.dentry))
 		goto out2;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
@@ -2243,16 +2242,9 @@ static void __init init_mount_tree(void)
 	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
-	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
-	if (!ns)
+	ns = create_mnt_ns(mnt);
+	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
-	atomic_set(&ns->count, 1);
-	INIT_LIST_HEAD(&ns->list);
-	init_waitqueue_head(&ns->poll);
-	ns->event = 0;
-	list_add(&mnt->mnt_list, &ns->list);
-	ns->root = mnt;
-	mnt->mnt_ns = ns;
 
 	init_task.nsproxy->mnt_ns = ns;
 	get_mnt_ns(ns);
@@ -2295,10 +2287,14 @@ void __init mnt_init(void)
 	init_mount_tree();
 }
 
-void __put_mnt_ns(struct mnt_namespace *ns)
+void put_mnt_ns(struct mnt_namespace *ns)
 {
-	struct vfsmount *root = ns->root;
+	struct vfsmount *root;
 	LIST_HEAD(umount_list);
+
+	if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+		return;
+	root = ns->root;
 	ns->root = NULL;
 	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
@@ -2309,3 +2305,4 @@ void __put_mnt_ns(struct mnt_namespace *ns)
 	release_mounts(&umount_list);
 	kfree(ns);
 }
+EXPORT_SYMBOL(put_mnt_ns);