diff options
author | Eric W. Biederman <ebiederm@xmission.com> | 2012-08-01 13:33:47 -0400 |
---|---|---|
committer | Eric W. Biederman <ebiederm@xmission.com> | 2012-11-19 08:59:10 -0500 |
commit | 0a01f2cc390e10633a54f72c608cc3fe19a50c3d (patch) | |
tree | e713a1c45b5ce125a5d33b61d528cd45264d47a7 /kernel/pid.c | |
parent | 17cf22c33e1f1b5e435469c84e43872579497653 (diff) |
pidns: Make the pidns proc mount/umount logic obvious.
Track the number of pids in the proc hash table. When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.
Move the mount of proc into alloc_pid when we allocate the pid for
init.
Remove the surprising calls of pid_ns_release_proc in fork and
proc_flush_task. Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.
Because of the call path, detach_pid is always called with the
rtnl_lock held, so free_pid is not allowed to sleep; the work of
unmounting proc is therefore moved to a work queue. This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.
In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.
Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Diffstat (limited to 'kernel/pid.c')
-rw-r--r-- | kernel/pid.c | 21 |
1 files changed, 17 insertions, 4 deletions
diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca0..e957f8b09136 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | #include <linux/proc_fs.h> | ||
39 | 40 | ||
40 | #define pid_hashfn(nr, ns) \ | 41 | #define pid_hashfn(nr, ns) \ |
41 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | 42 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) |
@@ -270,8 +271,12 @@ void free_pid(struct pid *pid) | |||
270 | unsigned long flags; | 271 | unsigned long flags; |
271 | 272 | ||
272 | spin_lock_irqsave(&pidmap_lock, flags); | 273 | spin_lock_irqsave(&pidmap_lock, flags); |
273 | for (i = 0; i <= pid->level; i++) | 274 | for (i = 0; i <= pid->level; i++) { |
274 | hlist_del_rcu(&pid->numbers[i].pid_chain); | 275 | struct upid *upid = pid->numbers + i; |
276 | hlist_del_rcu(&upid->pid_chain); | ||
277 | if (--upid->ns->nr_hashed == 0) | ||
278 | schedule_work(&upid->ns->proc_work); | ||
279 | } | ||
275 | spin_unlock_irqrestore(&pidmap_lock, flags); | 280 | spin_unlock_irqrestore(&pidmap_lock, flags); |
276 | 281 | ||
277 | for (i = 0; i <= pid->level; i++) | 282 | for (i = 0; i <= pid->level; i++) |
@@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
293 | goto out; | 298 | goto out; |
294 | 299 | ||
295 | tmp = ns; | 300 | tmp = ns; |
301 | pid->level = ns->level; | ||
296 | for (i = ns->level; i >= 0; i--) { | 302 | for (i = ns->level; i >= 0; i--) { |
297 | nr = alloc_pidmap(tmp); | 303 | nr = alloc_pidmap(tmp); |
298 | if (nr < 0) | 304 | if (nr < 0) |
@@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
303 | tmp = tmp->parent; | 309 | tmp = tmp->parent; |
304 | } | 310 | } |
305 | 311 | ||
312 | if (unlikely(is_child_reaper(pid))) { | ||
313 | if (pid_ns_prepare_proc(ns)) | ||
314 | goto out_free; | ||
315 | } | ||
316 | |||
306 | get_pid_ns(ns); | 317 | get_pid_ns(ns); |
307 | pid->level = ns->level; | ||
308 | atomic_set(&pid->count, 1); | 318 | atomic_set(&pid->count, 1); |
309 | for (type = 0; type < PIDTYPE_MAX; ++type) | 319 | for (type = 0; type < PIDTYPE_MAX; ++type) |
310 | INIT_HLIST_HEAD(&pid->tasks[type]); | 320 | INIT_HLIST_HEAD(&pid->tasks[type]); |
311 | 321 | ||
312 | upid = pid->numbers + ns->level; | 322 | upid = pid->numbers + ns->level; |
313 | spin_lock_irq(&pidmap_lock); | 323 | spin_lock_irq(&pidmap_lock); |
314 | for ( ; upid >= pid->numbers; --upid) | 324 | for ( ; upid >= pid->numbers; --upid) { |
315 | hlist_add_head_rcu(&upid->pid_chain, | 325 | hlist_add_head_rcu(&upid->pid_chain, |
316 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); | 326 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); |
327 | upid->ns->nr_hashed++; | ||
328 | } | ||
317 | spin_unlock_irq(&pidmap_lock); | 329 | spin_unlock_irq(&pidmap_lock); |
318 | 330 | ||
319 | out: | 331 | out: |
@@ -570,6 +582,7 @@ void __init pidmap_init(void) | |||
570 | /* Reserve PID 0. We never call free_pidmap(0) */ | 582 | /* Reserve PID 0. We never call free_pidmap(0) */ |
571 | set_bit(0, init_pid_ns.pidmap[0].page); | 583 | set_bit(0, init_pid_ns.pidmap[0].page); |
572 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 584 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
585 | init_pid_ns.nr_hashed = 1; | ||
573 | 586 | ||
574 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 587 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
575 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 588 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); |