aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2012-08-01 13:33:47 -0400
committerEric W. Biederman <ebiederm@xmission.com>2012-11-19 08:59:10 -0500
commit0a01f2cc390e10633a54f72c608cc3fe19a50c3d (patch)
treee713a1c45b5ce125a5d33b61d528cd45264d47a7
parent17cf22c33e1f1b5e435469c84e43872579497653 (diff)
pidns: Make the pidns proc mount/umount logic obvious.
Track the number of pids in the proc hash table. When the number of pids goes to 0, schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release_proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details, and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach_pid is always called with the rtnl_lock held, free_pid is not allowed to sleep, so the work of unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" <serge@hallyn.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/root.c5
-rw-r--r--include/linux/pid_namespace.h2
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/pid.c21
-rw-r--r--kernel/pid_namespace.c14
6 files changed, 26 insertions, 22 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6177fc238fdb..7621dc51cff8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task)
2590 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2590 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2591 tgid->numbers[i].nr); 2591 tgid->numbers[i].nr);
2592 } 2592 }
2593
2594 upid = &pid->numbers[pid->level];
2595 if (upid->nr == 1)
2596 pid_ns_release_proc(upid->ns);
2597} 2593}
2598 2594
2599static struct dentry *proc_pid_instantiate(struct inode *dir, 2595static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index fc1609321a78..f2f251158d35 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -155,11 +155,6 @@ void __init proc_root_init(void)
155 err = register_filesystem(&proc_fs_type); 155 err = register_filesystem(&proc_fs_type);
156 if (err) 156 if (err)
157 return; 157 return;
158 err = pid_ns_prepare_proc(&init_pid_ns);
159 if (err) {
160 unregister_filesystem(&proc_fs_type);
161 return;
162 }
163 158
164 proc_self_init(); 159 proc_self_init();
165 proc_symlink("mounts", NULL, "self/mounts"); 160 proc_symlink("mounts", NULL, "self/mounts");
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c89c9cfcd247..4c96acdb2489 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -21,6 +21,7 @@ struct pid_namespace {
21 struct kref kref; 21 struct kref kref;
22 struct pidmap pidmap[PIDMAP_ENTRIES]; 22 struct pidmap pidmap[PIDMAP_ENTRIES];
23 int last_pid; 23 int last_pid;
24 int nr_hashed;
24 struct task_struct *child_reaper; 25 struct task_struct *child_reaper;
25 struct kmem_cache *pid_cachep; 26 struct kmem_cache *pid_cachep;
26 unsigned int level; 27 unsigned int level;
@@ -32,6 +33,7 @@ struct pid_namespace {
32 struct bsd_acct_struct *bacct; 33 struct bsd_acct_struct *bacct;
33#endif 34#endif
34 struct user_namespace *user_ns; 35 struct user_namespace *user_ns;
36 struct work_struct proc_work;
35 kgid_t pid_gid; 37 kgid_t pid_gid;
36 int hide_pid; 38 int hide_pid;
37 int reboot; /* group exit code if this pidns was rebooted */ 39 int reboot; /* group exit code if this pidns was rebooted */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7798c247f4b9..666dc8b06606 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1476,8 +1476,6 @@ bad_fork_cleanup_io:
1476 if (p->io_context) 1476 if (p->io_context)
1477 exit_io_context(p); 1477 exit_io_context(p);
1478bad_fork_cleanup_namespaces: 1478bad_fork_cleanup_namespaces:
1479 if (unlikely(clone_flags & CLONE_NEWPID))
1480 pid_ns_release_proc(p->nsproxy->pid_ns);
1481 exit_task_namespaces(p); 1479 exit_task_namespaces(p);
1482bad_fork_cleanup_mm: 1480bad_fork_cleanup_mm:
1483 if (p->mm) 1481 if (p->mm)
diff --git a/kernel/pid.c b/kernel/pid.c
index 3a5f238c1ca0..e957f8b09136 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -270,8 +271,12 @@ void free_pid(struct pid *pid)
270 unsigned long flags; 271 unsigned long flags;
271 272
272 spin_lock_irqsave(&pidmap_lock, flags); 273 spin_lock_irqsave(&pidmap_lock, flags);
273 for (i = 0; i <= pid->level; i++) 274 for (i = 0; i <= pid->level; i++) {
274 hlist_del_rcu(&pid->numbers[i].pid_chain); 275 struct upid *upid = pid->numbers + i;
276 hlist_del_rcu(&upid->pid_chain);
277 if (--upid->ns->nr_hashed == 0)
278 schedule_work(&upid->ns->proc_work);
279 }
275 spin_unlock_irqrestore(&pidmap_lock, flags); 280 spin_unlock_irqrestore(&pidmap_lock, flags);
276 281
277 for (i = 0; i <= pid->level; i++) 282 for (i = 0; i <= pid->level; i++)
@@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
293 goto out; 298 goto out;
294 299
295 tmp = ns; 300 tmp = ns;
301 pid->level = ns->level;
296 for (i = ns->level; i >= 0; i--) { 302 for (i = ns->level; i >= 0; i--) {
297 nr = alloc_pidmap(tmp); 303 nr = alloc_pidmap(tmp);
298 if (nr < 0) 304 if (nr < 0)
@@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns)
303 tmp = tmp->parent; 309 tmp = tmp->parent;
304 } 310 }
305 311
312 if (unlikely(is_child_reaper(pid))) {
313 if (pid_ns_prepare_proc(ns))
314 goto out_free;
315 }
316
306 get_pid_ns(ns); 317 get_pid_ns(ns);
307 pid->level = ns->level;
308 atomic_set(&pid->count, 1); 318 atomic_set(&pid->count, 1);
309 for (type = 0; type < PIDTYPE_MAX; ++type) 319 for (type = 0; type < PIDTYPE_MAX; ++type)
310 INIT_HLIST_HEAD(&pid->tasks[type]); 320 INIT_HLIST_HEAD(&pid->tasks[type]);
311 321
312 upid = pid->numbers + ns->level; 322 upid = pid->numbers + ns->level;
313 spin_lock_irq(&pidmap_lock); 323 spin_lock_irq(&pidmap_lock);
314 for ( ; upid >= pid->numbers; --upid) 324 for ( ; upid >= pid->numbers; --upid) {
315 hlist_add_head_rcu(&upid->pid_chain, 325 hlist_add_head_rcu(&upid->pid_chain,
316 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 326 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
327 upid->ns->nr_hashed++;
328 }
317 spin_unlock_irq(&pidmap_lock); 329 spin_unlock_irq(&pidmap_lock);
318 330
319out: 331out:
@@ -570,6 +582,7 @@ void __init pidmap_init(void)
570 /* Reserve PID 0. We never call free_pidmap(0) */ 582 /* Reserve PID 0. We never call free_pidmap(0) */
571 set_bit(0, init_pid_ns.pidmap[0].page); 583 set_bit(0, init_pid_ns.pidmap[0].page);
572 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 584 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
585 init_pid_ns.nr_hashed = 1;
573 586
574 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 587 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
575 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 588 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b2604950aa50..84591cfeefc1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -72,6 +72,12 @@ err_alloc:
72 return NULL; 72 return NULL;
73} 73}
74 74
75static void proc_cleanup_work(struct work_struct *work)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
75/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
76#define MAX_PID_NS_LEVEL 32 82#define MAX_PID_NS_LEVEL 32
77 83
@@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
105 ns->level = level; 111 ns->level = level;
106 ns->parent = get_pid_ns(parent_pid_ns); 112 ns->parent = get_pid_ns(parent_pid_ns);
107 ns->user_ns = get_user_ns(user_ns); 113 ns->user_ns = get_user_ns(user_ns);
114 INIT_WORK(&ns->proc_work, proc_cleanup_work);
108 115
109 set_bit(0, ns->pidmap[0].page); 116 set_bit(0, ns->pidmap[0].page);
110 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 117 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
112 for (i = 1; i < PIDMAP_ENTRIES; i++) 119 for (i = 1; i < PIDMAP_ENTRIES; i++)
113 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 120 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
114 121
115 err = pid_ns_prepare_proc(ns);
116 if (err)
117 goto out_put_parent_pid_ns;
118
119 return ns; 122 return ns;
120 123
121out_put_parent_pid_ns:
122 put_pid_ns(parent_pid_ns);
123 put_user_ns(user_ns);
124out_free_map: 124out_free_map:
125 kfree(ns->pidmap[0].page); 125 kfree(ns->pidmap[0].page);
126out_free: 126out_free: