From 2c392b8c3450ceb69ba1b93cb0cddb3998fb8cdc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2010 19:41:39 +0100 Subject: cgroups: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Acked-by: Paul Menage Cc: Li Zefan Reviewed-by: Josh Triplett --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 192f88c5b0f9..e5c5497a7dca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. */ -- cgit v1.2.2 From db71922217a214e5c9268448e537b54fc1f301ea Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 15 Aug 2010 22:51:10 +0200 Subject: BKL: Explicitly add BKL around get_sb/fill_super This patch is a preparation necessary to remove the BKL from do_new_mount(). It explicitly adds calls to lock_kernel()/unlock_kernel() around get_sb/fill_super operations for filesystems that still uses the BKL. I've read through all the code formerly covered by the BKL inside do_kern_mount() and have satisfied myself that it doesn't need the BKL any more. do_kern_mount() is already called without the BKL when mounting the rootfs and in nfsctl. do_kern_mount() calls vfs_kern_mount(), which is called from various places without BKL: simple_pin_fs(), nfs_do_clone_mount() through nfs_follow_mountpoint(), afs_mntpt_do_automount() through afs_mntpt_follow_link(). Both later functions are actually the filesystems follow_link inode operation. vfs_kern_mount() is calling the specified get_sb function and lets the filesystem do its job by calling the given fill_super function. Therefore I think it is safe to push down the BKL from the VFS to the low-level filesystems get_sb/fill_super operation. 
[arnd: do not add the BKL to those file systems that already don't use it elsewhere] Signed-off-by: Jan Blunck Signed-off-by: Arnd Bergmann Cc: Matthew Wilcox Cc: Christoph Hellwig --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f6140..a7ba3bccadc5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1430,6 +1430,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *new_root; + lock_kernel(); + /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); @@ -1559,6 +1561,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); + unlock_kernel(); return 0; drop_new_super: @@ -1568,6 +1571,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); + unlock_kernel(); return ret; } -- cgit v1.2.2 From 38d018dba3f725b969f196550d92a6ec1c092428 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Wed, 24 Feb 2010 13:25:34 +0100 Subject: BKL: Remove BKL from cgroup The BKL is only used in remount_fs and get_sb that are both protected by the superblocks s_umount rw_semaphore. Therefore it is safe to remove the BKL entirely. 
Signed-off-by: Jan Blunck Signed-off-by: Arnd Bergmann --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7ba3bccadc5..304d27759949 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include /* TODO: replace with more sophisticated array */ @@ -1222,7 +1221,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; - lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1255,7 +1253,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - unlock_kernel(); return ret; } @@ -1430,8 +1427,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *new_root; - lock_kernel(); - /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); @@ -1561,7 +1556,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - unlock_kernel(); return 0; drop_new_super: @@ -1571,8 +1565,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - unlock_kernel(); - return ret; } -- cgit v1.2.2 From 85fe4025c616a7c0ed07bc2fc8c5371b07f3888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Oct 2010 11:19:54 -0400 Subject: fs: do not assign default i_ino in new_inode Instead of always assigning an increasing inode number in new_inode move the call to assign it into those callers that actually need it. 
For now callers that need it is estimated conservatively, that is the call is added to all filesystems that do not assign an i_ino by themselves. For a few more filesystems we can avoid assigning any inode number given that they aren't user visible, and for others it could be done lazily when an inode number is actually needed, but that's left for later patches. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b69b8d0313d..9270d532ec3c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -777,6 +777,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); -- cgit v1.2.2 From 97978e6d1f2da0073416870410459694fbdbfd9b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:35 -0700 Subject: cgroup: add clone_children control file The ns_cgroup is a control group interacting with the namespaces. When a new namespace is created, a corresponding cgroup is automatically created too. The cgroup name is the pid of the process who did 'unshare' or the child of 'clone'. This cgroup is tied with the namespace because it prevents a process to escape the control group and use the post_clone callback, so the child cgroup inherits the values of the parent cgroup. Unfortunately, the more we use this cgroup and the more we are facing problems with it: (1) when a process unshares, the cgroup name may conflict with a previous cgroup with the same pid, so unshare or clone return -EEXIST (2) the cgroup creation is out of control because there may have an application creating several namespaces where the system will automatically create several cgroups in his back and let them on the cgroupfs (eg. 
a vrf based on the network namespace). (3) the mix of (1) and (2) forces an administrator to regularly check and clean these cgroups. This patchset removes the ns_cgroup by adding a new flag to the cgroup and the cgroupfs mount option. It enables the copy of the parent cgroup when a child cgroup is created. We can then safely remove the ns_cgroup as this flag provides compatible behavior. We now have to manually create the cgroup and add the task to it, which is consistent with the cgroup framework. This patch: Sent as an answer to a previous thread around the ns_cgroup. https://lists.linux-foundation.org/pipermail/containers/2009-June/018627.html It adds a control file 'clone_children' for a cgroup. This control file is a boolean specifying if the child cgroup should be a clone of the parent cgroup or not. The default value is 'false'. This flag makes the child cgroup call the post_clone callback of every subsystem that provides one. At present, the cpuset is the only subsystem that implements the post_clone callback. The option can be set at mount time by specifying the 'clone_children' mount option. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. 
Biederman Acked-by: Paul Menage Reviewed-by: Li Zefan Cc: Jamal Hadi Salim Cc: Matt Helsley Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9270d532ec3c..4b218a46ddd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +static int clone_children(const struct cgroup *cgrp) +{ + return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +} + /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (clone_children(&root->top_cgroup)) + seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); @@ -1050,6 +1057,7 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + bool clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1097,6 +1105,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); + } else if (!strcmp(token, "clone_children")) { + opts->clone_children = true; } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1355,6 +1365,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); + if 
(opts->clone_children) + set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -3173,6 +3185,23 @@ fail: return ret; } +static u64 cgroup_clone_children_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return clone_children(cgrp); +} + +static int cgroup_clone_children_write(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + if (val) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + else + clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -3203,6 +3232,11 @@ static struct cftype files[] = { .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, }; static struct cftype cft_release_agent = { @@ -3332,6 +3366,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + if (clone_children(parent)) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); @@ -3346,6 +3383,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } /* At error, ->destroy() callback has to free assigned ID. */ + if (clone_children(parent) && ss->post_clone) + ss->post_clone(ss, cgrp); } cgroup_lock_hierarchy(root); -- cgit v1.2.2 From 32a8cf235e2f192eb002755076994525cdbaa35a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Oct 2010 15:33:37 -0700 Subject: cgroup: make the mount options parsing more accurate Current behavior: ================= (1) When we mount a cgroup, we can specify the 'all' option which means to enable all the cgroup subsystems. This is the default option when no option is specified. 
(2) If we want to mount a cgroup with a subset of the supported cgroup subsystems, we have to specify a list of subsystem names for the mount option. (3) If we specify another option like 'noprefix' or 'release_agent', the current code also requires the 'all' option or a subsystem name to be specified. This is not critical, but it is unfriendly, as we should assume (1) in this case. (4) Logically, the 'all' option is mutually exclusive with a subsystem name, but this is not detected. In other words: succeed : mount -t cgroup -o all,freezer cgroup /cgroup => is it 'all' or 'freezer' ? fails : mount -t cgroup -o noprefix cgroup /cgroup => succeed if we do '-o noprefix,all' The following patches consolidate the mount option checks a bit. New behavior: ============= (1) untouched (2) untouched (3) the 'all' option is the default when only options other than a subsystem name are specified (4) raises an error In other words: fails : mount -t cgroup -o all,freezer cgroup /cgroup succeed : mount -t cgroup -o noprefix cgroup /cgroup For the sake of readability, the if ... then ... else ... if ... indentation when parsing the options has been changed to: if ... then ... continue fi Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Reviewed-by: Li Zefan Reviewed-by: Paul Menage Cc: Eric W. 
Biederman Cc: Jamal Hadi Salim Cc: Matt Helsley Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 90 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 30 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4b218a46ddd3..3e6517e51fd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1074,7 +1074,8 @@ struct cgroup_sb_opts { */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { - char *token, *o = data ?: "all"; + char *token, *o = data; + bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; int i; bool module_pin_failed = false; @@ -1090,24 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) while ((token = strsep(&o, ",")) != NULL) { if (!*token) return -EINVAL; - if (!strcmp(token, "all")) { - /* Add all non-disabled subsystems */ - opts->subsys_bits = 0; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (!ss->disabled) - opts->subsys_bits |= 1ul << i; - } - } else if (!strcmp(token, "none")) { + if (!strcmp(token, "none")) { /* Explicitly have no subsystems */ opts->none = true; - } else if (!strcmp(token, "noprefix")) { + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); - } else if (!strcmp(token, "clone_children")) { + continue; + } + if (!strcmp(token, "clone_children")) { opts->clone_children = true; - } else if (!strncmp(token, "release_agent=", 14)) { + continue; + } + if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; @@ -1115,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct 
cgroup_sb_opts *opts) kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - } else if (!strncmp(token, "name=", 5)) { + continue; + } + if (!strncmp(token, "name=", 5)) { const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1137,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) GFP_KERNEL); if (!opts->name) return -ENOMEM; - } else { - struct cgroup_subsys *ss; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - ss = subsys[i]; - if (ss == NULL) - continue; - if (!strcmp(token, ss->name)) { - if (!ss->disabled) - set_bit(i, &opts->subsys_bits); - break; - } - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + + continue; + } + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (strcmp(token, ss->name)) + continue; + if (ss->disabled) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + set_bit(i, &opts->subsys_bits); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise 'all, 'none' and a subsystem name options were not + * specified, let's default to 'all' + */ + if (all_ss || (!all_ss && !one_ss && !opts->none)) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->disabled) + continue; + set_bit(i, &opts->subsys_bits); } } -- cgit v1.2.2 From f4a2589feaef0a9b737a3e582b37ee96695bb25f Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Wed, 27 Oct 2010 15:33:37 -0700 Subject: cgroups: add check for strcpy destination string overflow Function "strcpy" is used without check for maximum allowed source string length and could cause destination string overflow. Check for string length is added before using "strcpy". 
Function now is return error if source string length is more than a maximum. akpm: presently considered NotABug, but add the check for general future-safeness and robustness. Signed-off-by: Evgeny Kuznetsov Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3e6517e51fd3..5cf366965d0c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1922,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (strlen(buffer) >= PATH_MAX) + return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); -- cgit v1.2.2 From f7e835710ab5f6e43933c983f38f2d2e262b718c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 26 Jul 2010 13:23:11 +0400 Subject: convert cgroup and cpuset Signed-off-by: Al Viro --- kernel/cgroup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cf366965d0c..66a416b42c18 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1460,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } -static int cgroup_get_sb(struct file_system_type *fs_type, +static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct cgroup_sb_opts opts; struct cgroupfs_root *root; @@ -1596,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_parsed_module_refcounts(opts.subsys_bits); } - simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - return 0; + return dget(sb->s_root); drop_new_super: deactivate_locked_super(sb); @@ -1608,7 +1607,7 @@ static int cgroup_get_sb(struct 
file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - return ret; + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) { @@ -1658,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) { static struct file_system_type cgroup_fs_type = { .name = "cgroup", - .get_sb = cgroup_get_sb, + .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, }; -- cgit v1.2.2 From 5adcee1d8d32d7f305f6f5aaefdbf8f35adca177 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:20 +1100 Subject: cgroup fs: avoid switching ->d_op on live dentry Switching d_op on a live dentry is racy in general, so avoid it. In this case it is a negative dentry, which is safer, but there are still concurrent ops which may be called on d_op in that case (eg. d_revalidate). So in general a filesystem may not do this. Fix cgroupfs so as not to do this. Signed-off-by: Nick Piggin --- kernel/cgroup.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66a416b42c18..163c890f436d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -763,6 +763,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. 
*/ +static struct dentry *cgroup_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); @@ -2180,7 +2182,7 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, + .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, @@ -2196,13 +2198,29 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_create_file(struct dentry *dentry, mode_t mode, - struct super_block *sb) +static int cgroup_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry *cgroup_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) { - static const struct dentry_operations cgroup_dops = { + static const struct dentry_operations cgroup_dentry_operations = { + .d_delete = cgroup_delete_dentry, .d_iput = cgroup_diput, }; + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + dentry->d_op = &cgroup_dentry_operations; + d_add(dentry, NULL); + return NULL; +} + +static int cgroup_create_file(struct dentry *dentry, mode_t mode, + struct super_block *sb) +{ struct inode *inode; if (!dentry) @@ -2228,7 +2246,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, inode->i_size = 0; inode->i_fop = &cgroup_file_operations; } - dentry->d_op = &cgroup_dops; d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return 0; -- cgit v1.2.2 From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:23 +1100 Subject: fs: change d_delete semantics Change d_delete from a dentry deletion notification to a dentry caching advise, 
more like ->drop_inode. Require it to be constant and idempotent, and not take d_lock. This is how all existing filesystems use the callback anyway. This makes fine grained dentry locking of dput and dentry lru scanning much simpler. Signed-off-by: Nick Piggin --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 163c890f436d..746055b214d7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2198,7 +2198,7 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_delete_dentry(struct dentry *dentry) +static int cgroup_delete_dentry(const struct dentry *dentry) { return 1; } -- cgit v1.2.2 From b7ab39f631f505edc2bbdb86620d5493f995c9da Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:32 +1100 Subject: fs: dcache scale dentry refcount Make d_count non-atomic and protect it with d_lock. This allows us to ensure a 0 refcount dentry remains 0 without dcache_lock. It is also fairly natural when we start protecting many other dentry members with d_lock. Signed-off-by: Nick Piggin --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 746055b214d7..eb7af39350c6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3655,9 +3655,7 @@ again: list_del(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); - spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); - spin_unlock(&d->d_lock); cgroup_d_remove_dir(d); dput(d); -- cgit v1.2.2 From 2fd6b7f50797f2e993eea59e0a0b8c6399c811dc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:34 +1100 Subject: fs: dcache scale subdirs Protect d_subdirs and d_child with d_lock, except in filesystems that aren't using dcache_lock for these anyway (eg. using i_mutex). 
Note: if we change the locking rule in future so that ->d_child protection is provided only with ->d_parent->d_lock, it may allow us to reduce some locking. But it would be an exception to an otherwise regular locking scheme, so we'd have to see some good results. Probably not worthwhile. Signed-off-by: Nick Piggin --- kernel/cgroup.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eb7af39350c6..7b4705b51d4a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -877,23 +877,31 @@ static void cgroup_clear_directory(struct dentry *dentry) BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { struct dentry *d = list_entry(node, struct dentry, d_u.d_child); + + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(node); if (d->d_inode) { /* This should never be called on a cgroup * directory with child cgroups */ BUG_ON(d->d_inode->i_mode & S_IFDIR); - d = dget_locked(d); + dget_locked_dlock(d); + spin_unlock(&d->d_lock); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); spin_lock(&dcache_lock); - } + spin_lock(&dentry->d_lock); + } else + spin_unlock(&d->d_lock); node = dentry->d_subdirs.next; } + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } @@ -902,10 +910,17 @@ static void cgroup_clear_directory(struct dentry *dentry) */ static void cgroup_d_remove_dir(struct dentry *dentry) { + struct dentry *parent; + cgroup_clear_directory(dentry); spin_lock(&dcache_lock); + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + spin_lock(&dentry->d_lock); list_del_init(&dentry->d_u.d_child); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); spin_unlock(&dcache_lock); remove_dir(dentry); } -- cgit v1.2.2 From 
b5c84bf6f6fa3a7dfdcb556023a62953574b60ee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:38 +1100 Subject: fs: dcache remove dcache_lock dcache_lock no longer protects anything. remove it. Signed-off-by: Nick Piggin --- kernel/cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b4705b51d4a..1864cb6a6a59 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -876,7 +876,6 @@ static void cgroup_clear_directory(struct dentry *dentry) struct list_head *node; BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { @@ -891,18 +890,15 @@ static void cgroup_clear_directory(struct dentry *dentry) dget_locked_dlock(d); spin_unlock(&d->d_lock); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); } else spin_unlock(&d->d_lock); node = dentry->d_subdirs.next; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } /* @@ -914,14 +910,12 @@ static void cgroup_d_remove_dir(struct dentry *dentry) cgroup_clear_directory(dentry); - spin_lock(&dcache_lock); parent = dentry->d_parent; spin_lock(&parent->d_lock); spin_lock(&dentry->d_lock); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - spin_unlock(&dcache_lock); remove_dir(dentry); } -- cgit v1.2.2 From dc0474be3e27463d4d4a2793f82366eed906f223 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:43 +1100 Subject: fs: dcache rationalise dget variants dget_locked was a shortcut to avoid the lazy lru manipulation when we already held dcache_lock (lru manipulation was relatively cheap at that point). However, how that the lru lock is an innermost one, we never hold it at any caller, so the lock cost can now be avoided. 
We already have well working lazy dcache LRU, so it should be fine to defer LRU manipulations to scan time. Signed-off-by: Nick Piggin --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1864cb6a6a59..9f41470c3949 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -887,7 +887,7 @@ static void cgroup_clear_directory(struct dentry *dentry) /* This should never be called on a cgroup * directory with child cgroups */ BUG_ON(d->d_inode->i_mode & S_IFDIR); - dget_locked_dlock(d); + dget_dlock(d); spin_unlock(&d->d_lock); spin_unlock(&dentry->d_lock); d_delete(d); -- cgit v1.2.2 From fb045adb99d9b7c562dc7fef834857f78249daa1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:55 +1100 Subject: fs: dcache reduce branches in lookup path Reduce some branches and memory accesses in dcache lookup by adding dentry flags to indicate common d_ops are set, rather than having to check them. This saves a pointer memory access (dentry->d_op) in common path lookup situations, and saves another pointer load and branch in cases where we have d_op but not the particular operation. 
Patched with: git grep -E '[.>]([[:space:]])*d_op([[:space:]])*=' | xargs sed -e 's/\([^\t ]*\)->d_op = \(.*\);/d_set_d_op(\1, \2);/' -e 's/\([^\t ]*\)\.d_op = \(.*\);/d_set_d_op(\&\1, \2);/' -i Signed-off-by: Nick Piggin --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9f41470c3949..51cddc11cd85 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2222,7 +2222,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - dentry->d_op = &cgroup_dentry_operations; + d_set_d_op(dentry, &cgroup_dentry_operations); d_add(dentry, NULL); return NULL; } -- cgit v1.2.2 From 0df6a63f8735a7c8a877878bc215d4312e41ef81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Dec 2010 13:29:29 -0500 Subject: switch cgroup switching it to s_d_op allows to kill the cgroup_lookup() kludge. Signed-off-by: Al Viro --- kernel/cgroup.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 51cddc11cd85..5c5f4cc2e99a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -763,8 +763,6 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. 
*/ -static struct dentry *cgroup_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); @@ -1451,6 +1449,10 @@ static int cgroup_set_super(struct super_block *sb, void *data) static int cgroup_get_rootdir(struct super_block *sb) { + static const struct dentry_operations cgroup_dops = { + .d_iput = cgroup_diput, + }; + struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); struct dentry *dentry; @@ -1468,6 +1470,8 @@ static int cgroup_get_rootdir(struct super_block *sb) return -ENOMEM; } sb->s_root = dentry; + /* for everything else we want ->d_op set */ + sb->s_d_op = &cgroup_dops; return 0; } @@ -2191,7 +2195,7 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = cgroup_lookup, + .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, @@ -2207,26 +2211,6 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static struct dentry *cgroup_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - static const struct dentry_operations cgroup_dentry_operations = { - .d_delete = cgroup_delete_dentry, - .d_iput = cgroup_diput, - }; - - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_set_d_op(dentry, &cgroup_dentry_operations); - d_add(dentry, NULL); - return NULL; -} - static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { -- cgit v1.2.2 From 3ec762ad8be364c2fadfe0d6b2cc6d4d3b5e1b54 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Jan 2011 11:34:34 +0800 Subject: cgroups: Fix a 
lockdep warning at cgroup removal Commit 2fd6b7f5 ("fs: dcache scale subdirs") forgot to annotate a dentry lock, which caused a lockdep warning. Reported-by: Valdis Kletnieks Signed-off-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 51cddc11cd85..a7837e2d9d6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -912,7 +912,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) parent = dentry->d_parent; spin_lock(&parent->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); -- cgit v1.2.2 From c72a04e34735ec3f19f4788b7f95017310b5e1eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Jan 2011 05:31:45 +0000 Subject: cgroup_fs: fix cgroup use of simple_lookup() cgroup can't use simple_lookup(), since that'd override its desired ->d_op. 
Tested-by: Li Zefan Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c5f4cc2e99a..ffb7bbad0638 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static int cgroup_delete(const struct dentry *d) +{ + return 1; +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1451,6 +1457,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, + .d_delete = cgroup_delete, }; struct inode *inode = @@ -2195,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, + .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, }; +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + /* * Check if a file is a control file */ -- cgit v1.2.2 From d41d5a01631af821d3a3447e6613a316f5ee6c25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2011 17:02:20 +0100 Subject: cgroup: Fix cgroup_subsys::exit callback Make the ::exit method act like ::attach, it 
is after all very nearly the same thing. The bug had no effect on correctness - fixing it is an optimization for the scheduler. Also, later perf-cgroups patches rely on it. Signed-off-by: Peter Zijlstra Acked-by: Paul Menage LKML-Reference: <1297160655.13327.92.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..f6495f33a355 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. @@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } -- cgit v1.2.2 From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: perf: Add cgroup support This kernel patch adds the ability to filter monitoring based on container groups (cgroups). 
This is for use in per-cpu mode only. The cgroup to monitor is passed as a file descriptor in the pid argument to the syscall. The file descriptor must be opened to the cgroup name in the cgroup filesystem. For instance, if the cgroup name is foo and cgroupfs is mounted in /cgroup, then the file descriptor is opened to /cgroup/foo. Cgroup mode is activated by passing PERF_FLAG_PID_CGROUP in the flags argument to the syscall. For instance to measure in cgroup foo on CPU1 assuming cgroupfs is mounted under /cgroup: struct perf_event_attr attr; int cgroup_fd, fd; cgroup_fd = open("/cgroup/foo", O_RDONLY); fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP); close(cgroup_fd); Signed-off-by: Stephane Eranian [ added perf_cgroup_{exit,attach} ] Signed-off-by: Peter Zijlstra LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f6495f33a355..95362d15128c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? 
css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) -- cgit v1.2.2 From 8d2587970b8bdf7c8d9208e3f4bb93182aef1a0f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Tue, 22 Mar 2011 16:30:13 -0700 Subject: cgroups: if you list_empty() a head then don't list_del() it list_del() leaves poison in the prev and next pointers. The next list_empty() will compare those poisons, and say the list isn't empty. Any list operations that assume the node is on a list because of such a check will be fooled into dereferencing poison. One needs to INIT the node after the del, and fortunately there's already a wrapper for that - list_del_init(). Some of the dels are followed by deallocations, so can be ignored, and one can be merged with an add to make a move. Apart from that, I erred on the side of caution in making nodes list_empty()-queriable. Signed-off-by: Phil Carmody Reviewed-by: Paul Menage Cc: Li Zefan Acked-by: Kirill A. 
Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d15128c..e31b220a743d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } -- cgit v1.2.2 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. 
Signed-off-by: Lucas De Marchi --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e31b220a743d..25c7eb52de1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,7 +157,7 @@ struct css_id { }; /* - * cgroup_event represents events which userspace want to recieve. + * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { /* -- cgit v1.2.2 From 30088ad815802f850f26114920ccf9effd4bc520 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:53:46 +0800 Subject: cgroup,rcu: convert call_rcu(free_css_set_rcu) to kfree_rcu() The rcu callback free_css_set_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_css_set_rcu). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25c7eb52de1a..d5160a83fb35 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -static void free_css_set_rcu(struct rcu_head *obj) -{ - struct css_set *cg = container_of(obj, struct css_set, rcu_head); - kfree(cg); -} - /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). 
This * reduces the fork()/exit() overhead for people who have cgroups @@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } write_unlock(&css_set_lock); - call_rcu(&cg->rcu_head, free_css_set_rcu); + kfree_rcu(cg, rcu_head); } /* -- cgit v1.2.2 From f2da1c40dc003939f616f27a103b2592f1424b07 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:55:16 +0800 Subject: cgroup,rcu: convert call_rcu(free_cgroup_rcu) to kfree_rcu() The rcu callback free_cgroup_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_cgroup_rcu). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5160a83fb35..20451ce7195f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -806,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) return ret; } -static void free_cgroup_rcu(struct rcu_head *obj) -{ - struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); - - kfree(cgrp); -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -850,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); - call_rcu(&cgrp->rcu_head, free_cgroup_rcu); + kfree_rcu(cgrp, rcu_head); } iput(inode); } -- cgit v1.2.2 From 025cea99db3fb110ebc8ede5ff647833fab9574f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:56:10 +0800 Subject: cgroup,rcu: convert call_rcu(__free_css_id_cb) to kfree_rcu() The rcu callback __free_css_id_cb() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__free_css_id_cb). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 20451ce7195f..909a35510af5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4610,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, return ret; } -static void __free_css_id_cb(struct rcu_head *head) -{ - struct css_id *id; - - id = container_of(head, struct css_id, rcu_head); - kfree(id); -} - void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { struct css_id *id = css->id; @@ -4632,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); spin_unlock(&ss->id_lock); - call_rcu(&id->rcu_head, __free_css_id_cb); + kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); -- cgit v1.2.2 From f780bdb7c1c73009cb57adcf99ef50027d80bf3c Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:19 -0700 Subject: cgroups: add per-thread subsystem callbacks Add cgroup subsystem callbacks for per-thread attachment in atomic contexts Add can_attach_task(), pre_attach(), and attach_task() as new callbacks for cgroups's subsystem interface. Unlike can_attach and attach, these are for per-thread operations, to be called potentially many times when attaching an entire threadgroup. Also, the old "bool threadgroup" interface is removed, as replaced by this. All subsystems are modified for the new interface - of note is cpuset, which requires from/to nodemasks for attach to be globally scoped (though per-cpuset would work too) to persist from its pre_attach to attach_task and attach. This is a pre-patch for cgroup-procs-writable.patch. Signed-off-by: Ben Blum Cc: "Eric W. 
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a35510af5..38fb0ad1cb46 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1759,7 +1759,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1771,6 +1771,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } + if (ss->can_attach_task) { + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + goto out; + } + } } task_lock(tsk); @@ -1805,8 +1812,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) write_unlock(&css_set_lock); for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + if (ss->attach_task) + ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1829,7 +1840,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk, false); + ss->cancel_attach(ss, cgrp, tsk); } } return retval; -- cgit v1.2.2 From 74a1166dfe1135dcc168d35fa5261aa7e087011b Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:20 -0700 Subject: cgroups: make procs file writable Make procs file writable to move all threads by tgid at once. Add functionality that enables users to move all threads in a threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs' file. 
This current implementation makes use of a per-threadgroup rwsem that's taken for reading in the fork() path to prevent newly forking threads within the threadgroup from "escaping" while the move is in progress. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 439 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 393 insertions(+), 46 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38fb0ad1cb46..5e6a9745f0e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1735,6 +1735,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. 
*/ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. 
+ */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1745,11 +1815,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval = 0; + int retval; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; - struct css_set *cg; - struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; /* Nothing to do if the task is already in that cgroup */ @@ -1780,36 +1848,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - task_lock(tsk); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - put_css_set(cg); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - retval = -ESRCH; + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); + if (retval) goto out; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); - write_unlock(&css_set_lock); for_each_subsys(root, ss) { if (ss->pre_attach) @@ -1819,9 +1860,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) if (ss->attach) ss->attach(ss, cgrp, oldcgrp, tsk); } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); - put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1871,49 +1911,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. 
May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. + * TODO: possible optimization: use css_set->rcu_head for chaining instead + */ +struct cg_list_entry { + struct css_set *cg; + struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, + struct task_struct *tsk, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + + read_lock(&css_set_lock); + newcg = find_existing_css_set(cg, cgrp, template); + if (newcg) + get_css_set(newcg); + read_unlock(&css_set_lock); + + /* doesn't exist at all? */ + if (!newcg) + return false; + /* see if it's already in the list */ + list_for_each_entry(cg_entry, newcg_list, links) { + if (cg_entry->cg == newcg) { + put_css_set(newcg); + return true; + } + } + + /* not found */ + put_css_set(newcg); + return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. 
+ */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct task_struct **group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. 
+ */ + group_size = get_nr_threads(leader); + group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); + if (!group) + return -ENOMEM; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + group[i] = tsk; + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. */ + for (i = 0; i < group_size; i++) { + retval = ss->can_attach_task(cgrp, group[i]); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. 
+ */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = group[i]; + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + /* ignore this task if it's going away */ + task_unlock(tsk); + continue; + } + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = group[i]; + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. */ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. 
+ */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) + put_task_struct(group[i]); +out_free_group_list: + kfree(group); + return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { + if (!tsk) { rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. 
+ */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ + rcu_read_unlock(); + cgroup_unlock(); return -ESRCH; } + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); + cgroup_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { - tsk = current; + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; get_task_struct(tsk); } - ret = cgroup_attach_task(cgrp, tsk); + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } put_task_struct(tsk); + cgroup_unlock(); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the operation, in which case we need + * to find the task_struct for the new leader and start over. 
+ */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); return ret; } @@ -3270,9 +3617,9 @@ static struct cftype files[] = { { .name = CGROUP_FILE_GENERIC_PREFIX "procs", .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ + .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, - .mode = S_IRUGO, + .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", -- cgit v1.2.2 From d846687d7f84e45f23ecf3846dbb43312a1206dd Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:21 -0700 Subject: cgroups: use flex_array in attach_proc Convert cgroup_attach_proc to use flex_array. The cgroup_attach_proc implementation requires a pre-allocated array to store task pointers to atomically move a thread-group, but asking for a monolithic array with kmalloc() may be unreliable for very large groups. Using flex_array provides the same functionality with less risk of failure. This is a post-patch for cgroup-procs-write.patch. Signed-off-by: Ben Blum Cc: "Eric W. 
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5e6a9745f0e7..00a884342d3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include /* TODO: replace with more sophisticated array */ #include #include +#include /* used in cgroup_attach_proc */ #include @@ -1995,7 +1996,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; - struct task_struct **group; + struct flex_array *group; /* * we need to make sure we have css_sets for all the tasks we're * going to move -before- we actually start moving them, so that in @@ -2012,9 +2013,15 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * and if threads exit, this will just be an over-estimate. */ group_size = get_nr_threads(leader); - group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); if (!group) return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ rcu_read_lock(); @@ -2037,7 +2044,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. 
*/ BUG_ON(i >= group_size); get_task_struct(tsk); - group[i] = tsk; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ @@ -2059,7 +2071,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (ss->can_attach_task) { /* run on each task in the threadgroup. */ for (i = 0; i < group_size; i++) { - retval = ss->can_attach_task(cgrp, group[i]); + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); if (retval) { failed_ss = ss; cancel_failed_ss = true; @@ -2075,7 +2088,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { - tsk = group[i]; + tsk = flex_array_get_ptr(group, i); /* nothing to do if this task is already in the cgroup */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2114,7 +2127,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) ss->pre_attach(cgrp); } for (i = 0; i < group_size; i++) { - tsk = group[i]; + tsk = flex_array_get_ptr(group, i); /* leave current thread as it is if it's already there */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2167,10 +2180,12 @@ out_cancel_attach: } } /* clean up the array of referenced threads in the group. 
*/ - for (i = 0; i < group_size; i++) - put_task_struct(group[i]); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } out_free_group_list: - kfree(group); + flex_array_free(group); return retval; } -- cgit v1.2.2 From a77aea92010acf54ad785047234418d5d68772e2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 26 May 2011 16:25:23 -0700 Subject: cgroup: remove the ns_cgroup The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and leads to some problems: * cgroup creation is out-of-control * cgroup name can conflict when pids are looping * it is not possible to have a single process handling a lot of namespaces without falling into an exponential creation time * we may want to create a namespace without creating a cgroup The ns_cgroup was replaced by a compatibility flag 'clone_children', where a newly created cgroup will copy the parent cgroup values. The userspace has to manually create a cgroup and add a task to the 'tasks' file. This patch removes the ns_cgroup as suggested in the following thread: https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html The 'cgroup_clone' function is removed because it is no longer used. This is a userspace-visible change. Commit 45531757b45c ("cgroup: notify ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a printk warning users that the feature is planned for removal. Since that time we have heard from XXX users who were affected by this. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. 
Biederman Cc: Jamal Hadi Salim Reviewed-by: Li Zefan Acked-by: Paul Menage Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 116 -------------------------------------------------------- 1 file changed, 116 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00a884342d3d..2731d115d725 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4629,122 +4629,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) put_css_set_taskexit(cg); } -/** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. - */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Keep the cgroup alive */ - task_lock(tsk); - parent = task_cgroup(tsk, subsys->subsys_id); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this 
operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - /* The cgroup now exists. Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(root->sb); - return ret; -} - /** * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question -- cgit v1.2.2