aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/pid.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 18:44:47 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-17 18:44:47 -0500
commit6a2b60b17b3e48a418695a94bd2420f6ab32e519 (patch)
tree54b7792fa68b8890f710fa6398b6ba8626a039a8 /kernel/pid.c
parent9228ff90387e276ad67b10c0eb525c9d6a57d5e9 (diff)
parent98f842e675f96ffac96e6c50315790912b2812be (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace changes from Eric Biederman: "While small, this set of changes is very significant with respect to containers in general and user namespaces in particular. The user space interface is now complete. This set of changes adds support for unprivileged users to create user namespaces and as a user namespace root to create other namespaces. The tyranny of supporting suid root preventing unprivileged users from using cool new kernel features is broken. This set of changes completes the work on setns, adding support for the pid, user, and mount namespaces. This set of changes includes a bunch of basic pid namespace cleanups/simplifications. Of particular significance is the rework of the pid namespace cleanup so it no longer requires sending out tendrils into all kinds of unexpected cleanup paths for operation. At least one case of broken error handling is fixed by this cleanup. The files under /proc/<pid>/ns/ have been converted from regular files to magic symlinks which prevents incorrect caching by the VFS, ensuring the files always refer to the namespace the process is currently using and ensuring that the ptrace_may_access permission checks are always applied. The files under /proc/<pid>/ns/ have been given stable inode numbers so it is now possible to see if different processes share the same namespaces. Through David Miller's net tree are changes to relax many of the permission checks in the networking stack to allow the user namespace root to usefully use the networking stack. Similar changes for the mount namespace and the pid namespace are coming through my tree. Two small changes to add user namespace support were committed here and in David Miller's -net tree so that I could complete the work on the /proc/<pid>/ns/ files in this tree. 
Work remains to make it safe to build user namespaces and 9p, afs, ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the Kconfig guard remains in place preventing user namespaces from being built when any of those filesystems are enabled. Future design work remains to allow root users outside of the initial user namespace to mount more than just /proc and /sys." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits) proc: Usable inode numbers for the namespace file descriptors. proc: Fix the namespace inode permission checks. proc: Generalize proc inode allocation userns: Allow unprivilged mounts of proc and sysfs userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file procfs: Print task uids and gids in the userns that opened the proc file userns: Implement unshare of the user namespace userns: Implent proc namespace operations userns: Kill task_user_ns userns: Make create_new_namespaces take a user_ns parameter userns: Allow unprivileged use of setns. userns: Allow unprivileged users to create new namespaces userns: Allow setting a userns mapping to your current uid. userns: Allow chown and setgid preservation userns: Allow unprivileged users to create user namespaces. userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped userns: fix return value on mntns_install() failure vfs: Allow unprivileged manipulation of the mount namespace. vfs: Only support slave subtrees across different user namespaces vfs: Add a user namespace reference from struct mnt_namespace ...
Diffstat (limited to 'kernel/pid.c')
-rw-r--r--kernel/pid.c47
1 files changed, 39 insertions, 8 deletions
diff --git a/kernel/pid.c b/kernel/pid.c
index fd996c1ed9f8..3e2cf8100acc 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,6 +79,8 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
@@ -269,8 +272,24 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 272 unsigned long flags;
270 273
271 spin_lock_irqsave(&pidmap_lock, flags); 274 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 275 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 276 struct upid *upid = pid->numbers + i;
277 struct pid_namespace *ns = upid->ns;
278 hlist_del_rcu(&upid->pid_chain);
279 switch(--ns->nr_hashed) {
280 case 1:
281 /* When all that is left in the pid namespace
282 * is the reaper wake up the reaper. The reaper
283 * may be sleeping in zap_pid_ns_processes().
284 */
285 wake_up_process(ns->child_reaper);
286 break;
287 case 0:
288 ns->nr_hashed = -1;
289 schedule_work(&ns->proc_work);
290 break;
291 }
292 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 293 spin_unlock_irqrestore(&pidmap_lock, flags);
275 294
276 for (i = 0; i <= pid->level; i++) 295 for (i = 0; i <= pid->level; i++)
@@ -292,6 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 311 goto out;
293 312
294 tmp = ns; 313 tmp = ns;
314 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 315 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 316 nr = alloc_pidmap(tmp);
297 if (nr < 0) 317 if (nr < 0)
@@ -302,22 +322,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 322 tmp = tmp->parent;
303 } 323 }
304 324
325 if (unlikely(is_child_reaper(pid))) {
326 if (pid_ns_prepare_proc(ns))
327 goto out_free;
328 }
329
305 get_pid_ns(ns); 330 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 331 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 332 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 333 INIT_HLIST_HEAD(&pid->tasks[type]);
310 334
311 upid = pid->numbers + ns->level; 335 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 336 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 337 if (ns->nr_hashed < 0)
338 goto out_unlock;
339 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 340 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 341 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
342 upid->ns->nr_hashed++;
343 }
316 spin_unlock_irq(&pidmap_lock); 344 spin_unlock_irq(&pidmap_lock);
317 345
318out: 346out:
319 return pid; 347 return pid;
320 348
349out_unlock:
350 spin_unlock(&pidmap_lock);
321out_free: 351out_free:
322 while (++i <= ns->level) 352 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 353 free_pidmap(pid->numbers + i);
@@ -344,7 +374,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 374
345struct pid *find_vpid(int nr) 375struct pid *find_vpid(int nr)
346{ 376{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 377 return find_pid_ns(nr, task_active_pid_ns(current));
348} 378}
349EXPORT_SYMBOL_GPL(find_vpid); 379EXPORT_SYMBOL_GPL(find_vpid);
350 380
@@ -428,7 +458,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 458
429struct task_struct *find_task_by_vpid(pid_t vnr) 459struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 460{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 461 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 462}
433 463
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 464struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +513,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
483 513
484pid_t pid_vnr(struct pid *pid) 514pid_t pid_vnr(struct pid *pid)
485{ 515{
486 return pid_nr_ns(pid, current->nsproxy->pid_ns); 516 return pid_nr_ns(pid, task_active_pid_ns(current));
487} 517}
488EXPORT_SYMBOL_GPL(pid_vnr); 518EXPORT_SYMBOL_GPL(pid_vnr);
489 519
@@ -494,7 +524,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
494 524
495 rcu_read_lock(); 525 rcu_read_lock();
496 if (!ns) 526 if (!ns)
497 ns = current->nsproxy->pid_ns; 527 ns = task_active_pid_ns(current);
498 if (likely(pid_alive(task))) { 528 if (likely(pid_alive(task))) {
499 if (type != PIDTYPE_PID) 529 if (type != PIDTYPE_PID)
500 task = task->group_leader; 530 task = task->group_leader;
@@ -569,6 +599,7 @@ void __init pidmap_init(void)
569 /* Reserve PID 0. We never call free_pidmap(0) */ 599 /* Reserve PID 0. We never call free_pidmap(0) */
570 set_bit(0, init_pid_ns.pidmap[0].page); 600 set_bit(0, init_pid_ns.pidmap[0].page);
571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 601 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
602 init_pid_ns.nr_hashed = 1;
572 603
573 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 604 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
574 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 605 SLAB_HWCACHE_ALIGN | SLAB_PANIC);