diff options
author | Eric W. Biederman <ebiederm@xmission.com> | 2012-08-01 13:33:47 -0400 |
---|---|---|
committer | Eric W. Biederman <ebiederm@xmission.com> | 2012-11-19 08:59:10 -0500 |
commit | 0a01f2cc390e10633a54f72c608cc3fe19a50c3d (patch) | |
tree | e713a1c45b5ce125a5d33b61d528cd45264d47a7 | |
parent | 17cf22c33e1f1b5e435469c84e43872579497653 (diff) |
pidns: Make the pidns proc mount/umount logic obvious.
Track the number of pids in the proc hash table. When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.
Move the mount of proc into alloc_pid when we allocate the pid for
init.
Remove the surprising calls of pid_ns_release_proc in fork and
proc_flush_task. Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.
Because of the call path, detach_pid is always called with the
rtnl_lock held, so free_pid is not allowed to sleep and the work of
unmounting proc is moved to a work queue. This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.
In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.
Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
-rw-r--r-- | fs/proc/base.c | 4 | ||||
-rw-r--r-- | fs/proc/root.c | 5 | ||||
-rw-r--r-- | include/linux/pid_namespace.h | 2 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/pid.c | 21 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 14 |
6 files changed, 26 insertions, 22 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fdb..7621dc51cff8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) | |||
2590 | proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, | 2590 | proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, |
2591 | tgid->numbers[i].nr); | 2591 | tgid->numbers[i].nr); |
2592 | } | 2592 | } |
2593 | |||
2594 | upid = &pid->numbers[pid->level]; | ||
2595 | if (upid->nr == 1) | ||
2596 | pid_ns_release_proc(upid->ns); | ||
2597 | } | 2593 | } |
2598 | 2594 | ||
2599 | static struct dentry *proc_pid_instantiate(struct inode *dir, | 2595 | static struct dentry *proc_pid_instantiate(struct inode *dir, |
diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a78..f2f251158d35 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
@@ -155,11 +155,6 @@ void __init proc_root_init(void) | |||
155 | err = register_filesystem(&proc_fs_type); | 155 | err = register_filesystem(&proc_fs_type); |
156 | if (err) | 156 | if (err) |
157 | return; | 157 | return; |
158 | err = pid_ns_prepare_proc(&init_pid_ns); | ||
159 | if (err) { | ||
160 | unregister_filesystem(&proc_fs_type); | ||
161 | return; | ||
162 | } | ||
163 | 158 | ||
164 | proc_self_init(); | 159 | proc_self_init(); |
165 | proc_symlink("mounts", NULL, "self/mounts"); | 160 | proc_symlink("mounts", NULL, "self/mounts"); |
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c89c9cfcd247..4c96acdb2489 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h | |||
@@ -21,6 +21,7 @@ struct pid_namespace { | |||
21 | struct kref kref; | 21 | struct kref kref; |
22 | struct pidmap pidmap[PIDMAP_ENTRIES]; | 22 | struct pidmap pidmap[PIDMAP_ENTRIES]; |
23 | int last_pid; | 23 | int last_pid; |
24 | int nr_hashed; | ||
24 | struct task_struct *child_reaper; | 25 | struct task_struct *child_reaper; |
25 | struct kmem_cache *pid_cachep; | 26 | struct kmem_cache *pid_cachep; |
26 | unsigned int level; | 27 | unsigned int level; |
@@ -32,6 +33,7 @@ struct pid_namespace { | |||
32 | struct bsd_acct_struct *bacct; | 33 | struct bsd_acct_struct *bacct; |
33 | #endif | 34 | #endif |
34 | struct user_namespace *user_ns; | 35 | struct user_namespace *user_ns; |
36 | struct work_struct proc_work; | ||
35 | kgid_t pid_gid; | 37 | kgid_t pid_gid; |
36 | int hide_pid; | 38 | int hide_pid; |
37 | int reboot; /* group exit code if this pidns was rebooted */ | 39 | int reboot; /* group exit code if this pidns was rebooted */ |
diff --git a/kernel/fork.c b/kernel/fork.c index 7798c247f4b9..666dc8b06606 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1476,8 +1476,6 @@ bad_fork_cleanup_io: | |||
1476 | if (p->io_context) | 1476 | if (p->io_context) |
1477 | exit_io_context(p); | 1477 | exit_io_context(p); |
1478 | bad_fork_cleanup_namespaces: | 1478 | bad_fork_cleanup_namespaces: |
1479 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1480 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1481 | exit_task_namespaces(p); | 1479 | exit_task_namespaces(p); |
1482 | bad_fork_cleanup_mm: | 1480 | bad_fork_cleanup_mm: |
1483 | if (p->mm) | 1481 | if (p->mm) |
diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca0..e957f8b09136 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | #include <linux/proc_fs.h> | ||
39 | 40 | ||
40 | #define pid_hashfn(nr, ns) \ | 41 | #define pid_hashfn(nr, ns) \ |
41 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | 42 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) |
@@ -270,8 +271,12 @@ void free_pid(struct pid *pid) | |||
270 | unsigned long flags; | 271 | unsigned long flags; |
271 | 272 | ||
272 | spin_lock_irqsave(&pidmap_lock, flags); | 273 | spin_lock_irqsave(&pidmap_lock, flags); |
273 | for (i = 0; i <= pid->level; i++) | 274 | for (i = 0; i <= pid->level; i++) { |
274 | hlist_del_rcu(&pid->numbers[i].pid_chain); | 275 | struct upid *upid = pid->numbers + i; |
276 | hlist_del_rcu(&upid->pid_chain); | ||
277 | if (--upid->ns->nr_hashed == 0) | ||
278 | schedule_work(&upid->ns->proc_work); | ||
279 | } | ||
275 | spin_unlock_irqrestore(&pidmap_lock, flags); | 280 | spin_unlock_irqrestore(&pidmap_lock, flags); |
276 | 281 | ||
277 | for (i = 0; i <= pid->level; i++) | 282 | for (i = 0; i <= pid->level; i++) |
@@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
293 | goto out; | 298 | goto out; |
294 | 299 | ||
295 | tmp = ns; | 300 | tmp = ns; |
301 | pid->level = ns->level; | ||
296 | for (i = ns->level; i >= 0; i--) { | 302 | for (i = ns->level; i >= 0; i--) { |
297 | nr = alloc_pidmap(tmp); | 303 | nr = alloc_pidmap(tmp); |
298 | if (nr < 0) | 304 | if (nr < 0) |
@@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
303 | tmp = tmp->parent; | 309 | tmp = tmp->parent; |
304 | } | 310 | } |
305 | 311 | ||
312 | if (unlikely(is_child_reaper(pid))) { | ||
313 | if (pid_ns_prepare_proc(ns)) | ||
314 | goto out_free; | ||
315 | } | ||
316 | |||
306 | get_pid_ns(ns); | 317 | get_pid_ns(ns); |
307 | pid->level = ns->level; | ||
308 | atomic_set(&pid->count, 1); | 318 | atomic_set(&pid->count, 1); |
309 | for (type = 0; type < PIDTYPE_MAX; ++type) | 319 | for (type = 0; type < PIDTYPE_MAX; ++type) |
310 | INIT_HLIST_HEAD(&pid->tasks[type]); | 320 | INIT_HLIST_HEAD(&pid->tasks[type]); |
311 | 321 | ||
312 | upid = pid->numbers + ns->level; | 322 | upid = pid->numbers + ns->level; |
313 | spin_lock_irq(&pidmap_lock); | 323 | spin_lock_irq(&pidmap_lock); |
314 | for ( ; upid >= pid->numbers; --upid) | 324 | for ( ; upid >= pid->numbers; --upid) { |
315 | hlist_add_head_rcu(&upid->pid_chain, | 325 | hlist_add_head_rcu(&upid->pid_chain, |
316 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); | 326 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); |
327 | upid->ns->nr_hashed++; | ||
328 | } | ||
317 | spin_unlock_irq(&pidmap_lock); | 329 | spin_unlock_irq(&pidmap_lock); |
318 | 330 | ||
319 | out: | 331 | out: |
@@ -570,6 +582,7 @@ void __init pidmap_init(void) | |||
570 | /* Reserve PID 0. We never call free_pidmap(0) */ | 582 | /* Reserve PID 0. We never call free_pidmap(0) */ |
571 | set_bit(0, init_pid_ns.pidmap[0].page); | 583 | set_bit(0, init_pid_ns.pidmap[0].page); |
572 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 584 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
585 | init_pid_ns.nr_hashed = 1; | ||
573 | 586 | ||
574 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 587 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
575 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 588 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b2604950aa50..84591cfeefc1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -72,6 +72,12 @@ err_alloc: | |||
72 | return NULL; | 72 | return NULL; |
73 | } | 73 | } |
74 | 74 | ||
75 | static void proc_cleanup_work(struct work_struct *work) | ||
76 | { | ||
77 | struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); | ||
78 | pid_ns_release_proc(ns); | ||
79 | } | ||
80 | |||
75 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | 81 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
76 | #define MAX_PID_NS_LEVEL 32 | 82 | #define MAX_PID_NS_LEVEL 32 |
77 | 83 | ||
@@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
105 | ns->level = level; | 111 | ns->level = level; |
106 | ns->parent = get_pid_ns(parent_pid_ns); | 112 | ns->parent = get_pid_ns(parent_pid_ns); |
107 | ns->user_ns = get_user_ns(user_ns); | 113 | ns->user_ns = get_user_ns(user_ns); |
114 | INIT_WORK(&ns->proc_work, proc_cleanup_work); | ||
108 | 115 | ||
109 | set_bit(0, ns->pidmap[0].page); | 116 | set_bit(0, ns->pidmap[0].page); |
110 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | 117 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); |
@@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
112 | for (i = 1; i < PIDMAP_ENTRIES; i++) | 119 | for (i = 1; i < PIDMAP_ENTRIES; i++) |
113 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 120 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
114 | 121 | ||
115 | err = pid_ns_prepare_proc(ns); | ||
116 | if (err) | ||
117 | goto out_put_parent_pid_ns; | ||
118 | |||
119 | return ns; | 122 | return ns; |
120 | 123 | ||
121 | out_put_parent_pid_ns: | ||
122 | put_pid_ns(parent_pid_ns); | ||
123 | put_user_ns(user_ns); | ||
124 | out_free_map: | 124 | out_free_map: |
125 | kfree(ns->pidmap[0].page); | 125 | kfree(ns->pidmap[0].page); |
126 | out_free: | 126 | out_free: |