diff options
| author | Daniel Lezcano <daniel.lezcano@free.fr> | 2011-05-26 19:25:23 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-26 20:12:34 -0400 |
| commit | a77aea92010acf54ad785047234418d5d68772e2 (patch) | |
| tree | c7cb57b62fd02bee2baceb79251923f7caec6139 /kernel | |
| parent | d846687d7f84e45f23ecf3846dbb43312a1206dd (diff) | |
cgroup: remove the ns_cgroup
The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and
leads to some problems:
* cgroup creation is out-of-control
* cgroup name can conflict when pids are looping
* it is not possible to have a single process handling a lot of
namespaces without falling into an exponential creation time
* we may want to create a namespace without creating a cgroup
The ns_cgroup was replaced by a compatibility flag 'clone_children',
where a newly created cgroup will copy the parent cgroup values.
The userspace has to manually create a cgroup and add a task to
the 'tasks' file.
This patch removes the ns_cgroup as suggested in the following thread:
https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html
The 'cgroup_clone' function is removed because it is no longer used.
This is a userspace-visible change. Commit 45531757b45c ("cgroup: notify
ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a
printk warning that tells users the feature is planned for removal. Since that
time we have heard from XXX users who were affected by this.
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jamal Hadi Salim <hadi@cyberus.ca>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Makefile | 1 | ||||
| -rw-r--r-- | kernel/cgroup.c | 116 | ||||
| -rw-r--r-- | kernel/cpuset.c | 7 | ||||
| -rw-r--r-- | kernel/fork.c | 6 | ||||
| -rw-r--r-- | kernel/ns_cgroup.c | 118 | ||||
| -rw-r--r-- | kernel/nsproxy.c | 4 |
6 files changed, 3 insertions, 249 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e9cf19155b46..2d64cfcc8b42 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o | |||
| 61 | obj-$(CONFIG_CGROUPS) += cgroup.o | 61 | obj-$(CONFIG_CGROUPS) += cgroup.o |
| 62 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o | 62 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o |
| 63 | obj-$(CONFIG_CPUSETS) += cpuset.o | 63 | obj-$(CONFIG_CPUSETS) += cpuset.o |
| 64 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | ||
| 65 | obj-$(CONFIG_UTS_NS) += utsname.o | 64 | obj-$(CONFIG_UTS_NS) += utsname.o |
| 66 | obj-$(CONFIG_USER_NS) += user_namespace.o | 65 | obj-$(CONFIG_USER_NS) += user_namespace.o |
| 67 | obj-$(CONFIG_PID_NS) += pid_namespace.o | 66 | obj-$(CONFIG_PID_NS) += pid_namespace.o |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00a884342d3d..2731d115d725 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -4630,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 4630 | } | 4630 | } |
| 4631 | 4631 | ||
| 4632 | /** | 4632 | /** |
| 4633 | * cgroup_clone - clone the cgroup the given subsystem is attached to | ||
| 4634 | * @tsk: the task to be moved | ||
| 4635 | * @subsys: the given subsystem | ||
| 4636 | * @nodename: the name for the new cgroup | ||
| 4637 | * | ||
| 4638 | * Duplicate the current cgroup in the hierarchy that the given | ||
| 4639 | * subsystem is attached to, and move this task into the new | ||
| 4640 | * child. | ||
| 4641 | */ | ||
| 4642 | int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | ||
| 4643 | char *nodename) | ||
| 4644 | { | ||
| 4645 | struct dentry *dentry; | ||
| 4646 | int ret = 0; | ||
| 4647 | struct cgroup *parent, *child; | ||
| 4648 | struct inode *inode; | ||
| 4649 | struct css_set *cg; | ||
| 4650 | struct cgroupfs_root *root; | ||
| 4651 | struct cgroup_subsys *ss; | ||
| 4652 | |||
| 4653 | /* We shouldn't be called by an unregistered subsystem */ | ||
| 4654 | BUG_ON(!subsys->active); | ||
| 4655 | |||
| 4656 | /* First figure out what hierarchy and cgroup we're dealing | ||
| 4657 | * with, and pin them so we can drop cgroup_mutex */ | ||
| 4658 | mutex_lock(&cgroup_mutex); | ||
| 4659 | again: | ||
| 4660 | root = subsys->root; | ||
| 4661 | if (root == &rootnode) { | ||
| 4662 | mutex_unlock(&cgroup_mutex); | ||
| 4663 | return 0; | ||
| 4664 | } | ||
| 4665 | |||
| 4666 | /* Pin the hierarchy */ | ||
| 4667 | if (!atomic_inc_not_zero(&root->sb->s_active)) { | ||
| 4668 | /* We race with the final deactivate_super() */ | ||
| 4669 | mutex_unlock(&cgroup_mutex); | ||
| 4670 | return 0; | ||
| 4671 | } | ||
| 4672 | |||
| 4673 | /* Keep the cgroup alive */ | ||
| 4674 | task_lock(tsk); | ||
| 4675 | parent = task_cgroup(tsk, subsys->subsys_id); | ||
| 4676 | cg = tsk->cgroups; | ||
| 4677 | get_css_set(cg); | ||
| 4678 | task_unlock(tsk); | ||
| 4679 | |||
| 4680 | mutex_unlock(&cgroup_mutex); | ||
| 4681 | |||
| 4682 | /* Now do the VFS work to create a cgroup */ | ||
| 4683 | inode = parent->dentry->d_inode; | ||
| 4684 | |||
| 4685 | /* Hold the parent directory mutex across this operation to | ||
| 4686 | * stop anyone else deleting the new cgroup */ | ||
| 4687 | mutex_lock(&inode->i_mutex); | ||
| 4688 | dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); | ||
| 4689 | if (IS_ERR(dentry)) { | ||
| 4690 | printk(KERN_INFO | ||
| 4691 | "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, | ||
| 4692 | PTR_ERR(dentry)); | ||
| 4693 | ret = PTR_ERR(dentry); | ||
| 4694 | goto out_release; | ||
| 4695 | } | ||
| 4696 | |||
| 4697 | /* Create the cgroup directory, which also creates the cgroup */ | ||
| 4698 | ret = vfs_mkdir(inode, dentry, 0755); | ||
| 4699 | child = __d_cgrp(dentry); | ||
| 4700 | dput(dentry); | ||
| 4701 | if (ret) { | ||
| 4702 | printk(KERN_INFO | ||
| 4703 | "Failed to create cgroup %s: %d\n", nodename, | ||
| 4704 | ret); | ||
| 4705 | goto out_release; | ||
| 4706 | } | ||
| 4707 | |||
| 4708 | /* The cgroup now exists. Retake cgroup_mutex and check | ||
| 4709 | * that we're still in the same state that we thought we | ||
| 4710 | * were. */ | ||
| 4711 | mutex_lock(&cgroup_mutex); | ||
| 4712 | if ((root != subsys->root) || | ||
| 4713 | (parent != task_cgroup(tsk, subsys->subsys_id))) { | ||
| 4714 | /* Aargh, we raced ... */ | ||
| 4715 | mutex_unlock(&inode->i_mutex); | ||
| 4716 | put_css_set(cg); | ||
| 4717 | |||
| 4718 | deactivate_super(root->sb); | ||
| 4719 | /* The cgroup is still accessible in the VFS, but | ||
| 4720 | * we're not going to try to rmdir() it at this | ||
| 4721 | * point. */ | ||
| 4722 | printk(KERN_INFO | ||
| 4723 | "Race in cgroup_clone() - leaking cgroup %s\n", | ||
| 4724 | nodename); | ||
| 4725 | goto again; | ||
| 4726 | } | ||
| 4727 | |||
| 4728 | /* do any required auto-setup */ | ||
| 4729 | for_each_subsys(root, ss) { | ||
| 4730 | if (ss->post_clone) | ||
| 4731 | ss->post_clone(ss, child); | ||
| 4732 | } | ||
| 4733 | |||
| 4734 | /* All seems fine. Finish by moving the task into the new cgroup */ | ||
| 4735 | ret = cgroup_attach_task(child, tsk); | ||
| 4736 | mutex_unlock(&cgroup_mutex); | ||
| 4737 | |||
| 4738 | out_release: | ||
| 4739 | mutex_unlock(&inode->i_mutex); | ||
| 4740 | |||
| 4741 | mutex_lock(&cgroup_mutex); | ||
| 4742 | put_css_set(cg); | ||
| 4743 | mutex_unlock(&cgroup_mutex); | ||
| 4744 | deactivate_super(root->sb); | ||
| 4745 | return ret; | ||
| 4746 | } | ||
| 4747 | |||
| 4748 | /** | ||
| 4749 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp | 4633 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp |
| 4750 | * @cgrp: the cgroup in question | 4634 | * @cgrp: the cgroup in question |
| 4751 | * @task: the task in question | 4635 | * @task: the task in question |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 55b297d78adc..1ceeb049c827 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1802,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1802 | } | 1802 | } |
| 1803 | 1803 | ||
| 1804 | /* | 1804 | /* |
| 1805 | * post_clone() is called at the end of cgroup_clone(). | 1805 | * post_clone() is called during cgroup_create() when the |
| 1806 | * 'cgroup' was just created automatically as a result of | 1806 | * clone_children mount argument was specified. The cgroup |
| 1807 | * a cgroup_clone(), and the current task is about to | 1807 | * can not yet have any tasks. |
| 1808 | * be moved into 'cgroup'. | ||
| 1809 | * | 1808 | * |
| 1810 | * Currently we refuse to set up the cgroup - thereby | 1809 | * Currently we refuse to set up the cgroup - thereby |
| 1811 | * refusing the task to be entered, and as a result refusing | 1810 | * refusing the task to be entered, and as a result refusing |
diff --git a/kernel/fork.c b/kernel/fork.c index 1fa9d940e301..1f84099ecce6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1229,12 +1229,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1229 | if (clone_flags & CLONE_THREAD) | 1229 | if (clone_flags & CLONE_THREAD) |
| 1230 | p->tgid = current->tgid; | 1230 | p->tgid = current->tgid; |
| 1231 | 1231 | ||
| 1232 | if (current->nsproxy != p->nsproxy) { | ||
| 1233 | retval = ns_cgroup_clone(p, pid); | ||
| 1234 | if (retval) | ||
| 1235 | goto bad_fork_free_pid; | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1232 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
| 1239 | /* | 1233 | /* |
| 1240 | * Clear TID on mm_release()? | 1234 | * Clear TID on mm_release()? |
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0e..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null | |||
| @@ -1,118 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * ns_cgroup.c - namespace cgroup subsystem | ||
| 3 | * | ||
| 4 | * Copyright 2006, 2007 IBM Corp | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/module.h> | ||
| 8 | #include <linux/cgroup.h> | ||
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/proc_fs.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/nsproxy.h> | ||
| 13 | |||
| 14 | struct ns_cgroup { | ||
| 15 | struct cgroup_subsys_state css; | ||
| 16 | }; | ||
| 17 | |||
| 18 | struct cgroup_subsys ns_subsys; | ||
| 19 | |||
| 20 | static inline struct ns_cgroup *cgroup_to_ns( | ||
| 21 | struct cgroup *cgroup) | ||
| 22 | { | ||
| 23 | return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), | ||
| 24 | struct ns_cgroup, css); | ||
| 25 | } | ||
| 26 | |||
| 27 | int ns_cgroup_clone(struct task_struct *task, struct pid *pid) | ||
| 28 | { | ||
| 29 | char name[PROC_NUMBUF]; | ||
| 30 | |||
| 31 | snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); | ||
| 32 | return cgroup_clone(task, &ns_subsys, name); | ||
| 33 | } | ||
| 34 | |||
| 35 | /* | ||
| 36 | * Rules: | ||
| 37 | * 1. you can only enter a cgroup which is a descendant of your current | ||
| 38 | * cgroup | ||
| 39 | * 2. you can only place another process into a cgroup if | ||
| 40 | * a. you have CAP_SYS_ADMIN | ||
| 41 | * b. your cgroup is an ancestor of task's destination cgroup | ||
| 42 | * (hence either you are in the same cgroup as task, or in an | ||
| 43 | * ancestor cgroup thereof) | ||
| 44 | */ | ||
| 45 | static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, | ||
| 46 | struct task_struct *task, bool threadgroup) | ||
| 47 | { | ||
| 48 | if (current != task) { | ||
| 49 | if (!capable(CAP_SYS_ADMIN)) | ||
| 50 | return -EPERM; | ||
| 51 | |||
| 52 | if (!cgroup_is_descendant(new_cgroup, current)) | ||
| 53 | return -EPERM; | ||
| 54 | } | ||
| 55 | |||
| 56 | if (!cgroup_is_descendant(new_cgroup, task)) | ||
| 57 | return -EPERM; | ||
| 58 | |||
| 59 | if (threadgroup) { | ||
| 60 | struct task_struct *c; | ||
| 61 | rcu_read_lock(); | ||
| 62 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
| 63 | if (!cgroup_is_descendant(new_cgroup, c)) { | ||
| 64 | rcu_read_unlock(); | ||
| 65 | return -EPERM; | ||
| 66 | } | ||
| 67 | } | ||
| 68 | rcu_read_unlock(); | ||
| 69 | } | ||
| 70 | |||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | /* | ||
| 75 | * Rules: you can only create a cgroup if | ||
| 76 | * 1. you are capable(CAP_SYS_ADMIN) | ||
| 77 | * 2. the target cgroup is a descendant of your own cgroup | ||
| 78 | */ | ||
| 79 | static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | ||
| 80 | struct cgroup *cgroup) | ||
| 81 | { | ||
| 82 | struct ns_cgroup *ns_cgroup; | ||
| 83 | |||
| 84 | if (!capable(CAP_SYS_ADMIN)) | ||
| 85 | return ERR_PTR(-EPERM); | ||
| 86 | if (!cgroup_is_descendant(cgroup, current)) | ||
| 87 | return ERR_PTR(-EPERM); | ||
| 88 | if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { | ||
| 89 | printk("ns_cgroup can't be created with parent " | ||
| 90 | "'clone_children' set.\n"); | ||
| 91 | return ERR_PTR(-EINVAL); | ||
| 92 | } | ||
| 93 | |||
| 94 | printk_once("ns_cgroup deprecated: consider using the " | ||
| 95 | "'clone_children' flag without the ns_cgroup.\n"); | ||
| 96 | |||
| 97 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | ||
| 98 | if (!ns_cgroup) | ||
| 99 | return ERR_PTR(-ENOMEM); | ||
| 100 | return &ns_cgroup->css; | ||
| 101 | } | ||
| 102 | |||
| 103 | static void ns_destroy(struct cgroup_subsys *ss, | ||
| 104 | struct cgroup *cgroup) | ||
| 105 | { | ||
| 106 | struct ns_cgroup *ns_cgroup; | ||
| 107 | |||
| 108 | ns_cgroup = cgroup_to_ns(cgroup); | ||
| 109 | kfree(ns_cgroup); | ||
| 110 | } | ||
| 111 | |||
| 112 | struct cgroup_subsys ns_subsys = { | ||
| 113 | .name = "ns", | ||
| 114 | .can_attach = ns_can_attach, | ||
| 115 | .create = ns_create, | ||
| 116 | .destroy = ns_destroy, | ||
| 117 | .subsys_id = ns_subsys_id, | ||
| 118 | }; | ||
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5424e37673ed..d6a00f3de15d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -201,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
| 201 | goto out; | 201 | goto out; |
| 202 | } | 202 | } |
| 203 | 203 | ||
| 204 | err = ns_cgroup_clone(current, task_pid(current)); | ||
| 205 | if (err) | ||
| 206 | put_nsproxy(*new_nsp); | ||
| 207 | |||
| 208 | out: | 204 | out: |
| 209 | return err; | 205 | return err; |
| 210 | } | 206 | } |
