aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2014-02-11 11:52:49 -0500
committerTejun Heo <tj@kernel.org>2014-02-11 11:52:49 -0500
commit2bd59d48ebfb3df41ee56938946ca0dd30887312 (patch)
tree8e18b5e94b96f42418113687f27c61fa1ec5e147 /kernel/cgroup.c
parentf2e85d574e881ff3c597518c1ab48c86f9109880 (diff)
cgroup: convert to kernfs
cgroup filesystem code was derived from the original sysfs implementation which was heavily intertwined with vfs objects and locking with the goal of re-using the existing vfs infrastructure. That experiment turned out rather disastrous and sysfs switched, a long time ago, to distributed filesystem model where a separate representation is maintained which is queried by vfs. Unfortunately, cgroup stuck with the failed experiment all these years and accumulated even more problems over time. Locking and object lifetime management being entangled with vfs is probably the most egregious. vfs is never designed to be misused like this and cgroup ends up jumping through various convoluted dancing to make things work. Even then, operations across multiple cgroups can't be done safely as it'll deadlock with rename locking. Recently, kernfs is separated out from sysfs so that it can be used by users other than sysfs. This patch converts cgroup to use kernfs, which will bring the following benefits. * Separation from vfs internals. Locking and object lifetime management is contained in cgroup proper making things a lot simpler. This removes significant amount of locking convolutions, hairy object lifetime rules and the restriction on multi-cgroup operations. * Can drop a lot of code to implement filesystem interface as most are provided by kernfs. * Proper "severing" semantics, which allows controllers to not worry about lingering file accesses after offline. While the preceding patches did as much as possible to make the transition less painful, large part of the conversion has to be one discrete step making this patch rather large. The rest of the commit message lists notable changes in different areas. Overall ------- * vfs constructs replaced with kernfs ones. cgroup->dentry w/ ->kn, cgroupfs_root->sb w/ ->kf_root. * All dentry accessors are removed. Helpers to map from kernfs constructs are added. * All vfs plumbing around dentry, inode and bdi removed. * cgroup_mount() now directly looks for matching root and then proceeds to create a new one if not found. Synchronization and object lifetime ----------------------------------- * vfs inode locking removed. Among other things, this removes the need for the convolution in cgroup_cfts_commit(). Future patches will further simplify it. * vfs refcnting replaced with cgroup internal ones. cgroup->refcnt, cgroupfs_root->refcnt added. cgroup_put_root() now directly puts root->refcnt and when it reaches zero proceeds to destroy it thus merging cgroup_put_root() and the former cgroup_kill_sb(). Simliarly, cgroup_put() now directly schedules cgroup_free_rcu() when refcnt reaches zero. * Unlike before, kernfs objects don't hold onto cgroup objects. When cgroup destroys a kernfs node, all existing operations are drained and the association is broken immediately. The same for cgroupfs_roots and mounts. * All operations which come through kernfs guarantee that the associated cgroup is and stays valid for the duration of operation; however, there are two paths which need to find out the associated cgroup from dentry without going through kernfs - css_tryget_from_dir() and cgroupstats_build(). For these two, kernfs_node->priv is RCU managed so that they can dereference it under RCU read lock. File and directory handling --------------------------- * File and directory operations converted to kernfs_ops and kernfs_syscall_ops. * xattrs is implicitly supported by kernfs. No need to worry about it from cgroup. 
This means that "xattr" mount option is no longer necessary. A future patch will add a deprecated warning message when sane_behavior. * When cftype->max_write_len > PAGE_SIZE, it's necessary to make a private copy of one of the kernfs_ops to set its atomic_write_len. cftype->kf_ops is added and cgroup_init/exit_cftypes() are updated to handle it. * cftype->lockdep_key added so that kernfs lockdep annotation can be per cftype. * Inidividual file entries and open states are now managed by kernfs. No need to worry about them from cgroup. cfent, cgroup_open_file and their friends are removed. * kernfs_nodes are created deactivated and kernfs_activate() invocations added to places where creation of new nodes are committed. * cgroup_rmdir() uses kernfs_[un]break_active_protection() for self-removal. v2: - Li pointed out in an earlier patch that specifying "name=" during mount without subsystem specification should succeed if there's an existing hierarchy with a matching name although it should fail with -EINVAL if a new hierarchy should be created. Prior to the conversion, this used by handled by deferring failure from NULL return from cgroup_root_from_opts(), which was necessary because root was being created before checking for existing ones. Note that cgroup_root_from_opts() returned an ERR_PTR() value for error conditions which require immediate mount failure. As we now have separate search and creation steps, deferring failure from cgroup_root_from_opts() is no longer necessary. cgroup_root_from_opts() is updated to always return ERR_PTR() value on failure. - The logic to match existing roots is updated so that a mount attempt with a matching name but different subsys_mask are rejected. This was handled by a separate matching loop under the comment "Check for name clashes with existing mounts" but got lost during conversion. Merge the check into the main search loop. - Add __rcu __force casting in RCU_INIT_POINTER() in cgroup_destroy_locked() to avoid the sparse address space warning reported by kbuild test bot. Maybe we want an explicit interface to use kn->priv as RCU protected pointer? v3: Make CONFIG_CGROUPS select CONFIG_KERNFS. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Cc: kbuild test robot fengguang.wu@intel.com>
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c1115
1 files changed, 362 insertions, 753 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d8efca44de5f..cda614da40cf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,9 +40,7 @@
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h>
44#include <linux/slab.h> 43#include <linux/slab.h>
45#include <linux/magic.h>
46#include <linux/spinlock.h> 44#include <linux/spinlock.h>
47#include <linux/string.h> 45#include <linux/string.h>
48#include <linux/sort.h> 46#include <linux/sort.h>
@@ -50,7 +48,6 @@
50#include <linux/delayacct.h> 48#include <linux/delayacct.h>
51#include <linux/cgroupstats.h> 49#include <linux/cgroupstats.h>
52#include <linux/hashtable.h> 50#include <linux/hashtable.h>
53#include <linux/namei.h>
54#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
55#include <linux/idr.h> 52#include <linux/idr.h>
56#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -176,7 +173,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
176static int cgroup_destroy_locked(struct cgroup *cgrp); 173static int cgroup_destroy_locked(struct cgroup *cgrp);
177static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 174static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
178 bool is_add); 175 bool is_add);
179static int cgroup_file_release(struct inode *inode, struct file *file);
180static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 176static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
181 177
182/** 178/**
@@ -209,8 +205,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
209 205
210struct cgroup_subsys_state *seq_css(struct seq_file *seq) 206struct cgroup_subsys_state *seq_css(struct seq_file *seq)
211{ 207{
212 struct cgroup_open_file *of = seq->private; 208 struct kernfs_open_file *of = seq->private;
213 return of->cfe->css; 209 struct cgroup *cgrp = of->kn->parent->priv;
210 struct cftype *cft = seq_cft(seq);
211
212 /*
213 * This is open and unprotected implementation of cgroup_css().
214 * seq_css() is only called from a kernfs file operation which has
215 * an active reference on the file. Because all the subsystem
216 * files are drained before a css is disassociated with a cgroup,
217 * the matching css from the cgroup's subsys table is guaranteed to
218 * be and stay valid until the enclosing operation is complete.
219 */
220 if (cft->ss)
221 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
222 else
223 return &cgrp->dummy_css;
214} 224}
215EXPORT_SYMBOL_GPL(seq_css); 225EXPORT_SYMBOL_GPL(seq_css);
216 226
@@ -276,21 +286,6 @@ static int notify_on_release(const struct cgroup *cgrp)
276#define for_each_active_root(root) \ 286#define for_each_active_root(root) \
277 list_for_each_entry((root), &cgroup_roots, root_list) 287 list_for_each_entry((root), &cgroup_roots, root_list)
278 288
279static inline struct cgroup *__d_cgrp(struct dentry *dentry)
280{
281 return dentry->d_fsdata;
282}
283
284static inline struct cfent *__d_cfe(struct dentry *dentry)
285{
286 return dentry->d_fsdata;
287}
288
289static inline struct cftype *__d_cft(struct dentry *dentry)
290{
291 return __d_cfe(dentry)->type;
292}
293
294/** 289/**
295 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 290 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
296 * @cgrp: the cgroup to be checked for liveness 291 * @cgrp: the cgroup to be checked for liveness
@@ -692,6 +687,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
692 return cset; 687 return cset;
693} 688}
694 689
690static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
691{
692 struct cgroup *top_cgrp = kf_root->kn->priv;
693
694 return top_cgrp->root;
695}
696
695static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 697static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
696{ 698{
697 int id; 699 int id;
@@ -730,30 +732,37 @@ static void cgroup_free_root(struct cgroupfs_root *root)
730 732
731static void cgroup_get_root(struct cgroupfs_root *root) 733static void cgroup_get_root(struct cgroupfs_root *root)
732{ 734{
733 atomic_inc(&root->sb->s_active); 735 /*
736 * The caller must ensure that @root is alive, which can be
737 * achieved by holding a ref on one of the member cgroups or
738 * following a registered reference to @root while holding
739 * cgroup_tree_mutex.
740 */
741 WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0);
742 atomic_inc(&root->refcnt);
734} 743}
735 744
736static void cgroup_put_root(struct cgroupfs_root *root) 745static void cgroup_put_root(struct cgroupfs_root *root)
737{ 746{
738 deactivate_super(root->sb);
739}
740
741static void cgroup_kill_sb(struct super_block *sb)
742{
743 struct cgroupfs_root *root = sb->s_fs_info;
744 struct cgroup *cgrp = &root->top_cgroup; 747 struct cgroup *cgrp = &root->top_cgroup;
745 struct cgrp_cset_link *link, *tmp_link; 748 struct cgrp_cset_link *link, *tmp_link;
746 int ret; 749 int ret;
747 750
748 BUG_ON(!root); 751 /*
752 * @root's refcnt reaching zero and its deregistration should be
753 * atomic w.r.t. cgroup_tree_mutex. This ensures that
754 * cgroup_get_root() is safe to invoke if @root is registered.
755 */
756 mutex_lock(&cgroup_tree_mutex);
757 if (!atomic_dec_and_test(&root->refcnt)) {
758 mutex_unlock(&cgroup_tree_mutex);
759 return;
760 }
761 mutex_lock(&cgroup_mutex);
749 762
750 BUG_ON(root->number_of_cgroups != 1); 763 BUG_ON(root->number_of_cgroups != 1);
751 BUG_ON(!list_empty(&cgrp->children)); 764 BUG_ON(!list_empty(&cgrp->children));
752 765
753 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
754 mutex_lock(&cgroup_tree_mutex);
755 mutex_lock(&cgroup_mutex);
756
757 /* Rebind all subsystems back to the default hierarchy */ 766 /* Rebind all subsystems back to the default hierarchy */
758 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 767 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
759 ret = rebind_subsystems(root, 0, root->subsys_mask); 768 ret = rebind_subsystems(root, 0, root->subsys_mask);
@@ -783,11 +792,8 @@ static void cgroup_kill_sb(struct super_block *sb)
783 792
784 mutex_unlock(&cgroup_mutex); 793 mutex_unlock(&cgroup_mutex);
785 mutex_unlock(&cgroup_tree_mutex); 794 mutex_unlock(&cgroup_tree_mutex);
786 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
787
788 simple_xattrs_free(&cgrp->xattrs);
789 795
790 kill_litter_super(sb); 796 kernfs_destroy_root(root->kf_root);
791 cgroup_free_root(root); 797 cgroup_free_root(root);
792} 798}
793 799
@@ -878,42 +884,10 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
878 * update of a tasks cgroup pointer by cgroup_attach_task() 884 * update of a tasks cgroup pointer by cgroup_attach_task()
879 */ 885 */
880 886
881/*
882 * A couple of forward declarations required, due to cyclic reference loop:
883 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
884 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
885 * -> cgroup_mkdir.
886 */
887
888static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
889static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
890static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 887static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
891static const struct inode_operations cgroup_dir_inode_operations; 888static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
892static const struct file_operations proc_cgroupstats_operations; 889static const struct file_operations proc_cgroupstats_operations;
893 890
894static struct backing_dev_info cgroup_backing_dev_info = {
895 .name = "cgroup",
896 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
897};
898
899static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
900{
901 struct inode *inode = new_inode(sb);
902
903 if (inode) {
904 do {
905 /* ino 0 is reserved for dummy_root */
906 inode->i_ino = get_next_ino();
907 } while (!inode->i_ino);
908 inode->i_mode = mode;
909 inode->i_uid = current_fsuid();
910 inode->i_gid = current_fsgid();
911 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
912 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
913 }
914 return inode;
915}
916
917static struct cgroup_name *cgroup_alloc_name(const char *name_str) 891static struct cgroup_name *cgroup_alloc_name(const char *name_str)
918{ 892{
919 struct cgroup_name *name; 893 struct cgroup_name *name;
@@ -983,8 +957,6 @@ static void cgroup_free_fn(struct work_struct *work)
983 957
984 cgroup_pidlist_destroy_all(cgrp); 958 cgroup_pidlist_destroy_all(cgrp);
985 959
986 simple_xattrs_free(&cgrp->xattrs);
987
988 kfree(rcu_dereference_raw(cgrp->name)); 960 kfree(rcu_dereference_raw(cgrp->name));
989 kfree(cgrp); 961 kfree(cgrp);
990} 962}
@@ -999,81 +971,38 @@ static void cgroup_free_rcu(struct rcu_head *head)
999 971
1000static void cgroup_get(struct cgroup *cgrp) 972static void cgroup_get(struct cgroup *cgrp)
1001{ 973{
1002 dget(cgrp->dentry); 974 WARN_ON_ONCE(cgroup_is_dead(cgrp));
1003} 975 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
1004 976 atomic_inc(&cgrp->refcnt);
1005static void cgroup_diput(struct dentry *dentry, struct inode *inode)
1006{
1007 /* is dentry a directory ? if so, kfree() associated cgroup */
1008 if (S_ISDIR(inode->i_mode)) {
1009 struct cgroup *cgrp = dentry->d_fsdata;
1010
1011 BUG_ON(!(cgroup_is_dead(cgrp)));
1012
1013 /*
1014 * XXX: cgrp->id is only used to look up css's. As cgroup
1015 * and css's lifetimes will be decoupled, it should be made
1016 * per-subsystem and moved to css->id so that lookups are
1017 * successful until the target css is released.
1018 */
1019 mutex_lock(&cgroup_mutex);
1020 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
1021 mutex_unlock(&cgroup_mutex);
1022 cgrp->id = -1;
1023
1024 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1025 } else {
1026 struct cfent *cfe = __d_cfe(dentry);
1027 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
1028
1029 WARN_ONCE(!list_empty(&cfe->node) &&
1030 cgrp != &cgrp->root->top_cgroup,
1031 "cfe still linked for %s\n", cfe->type->name);
1032 simple_xattrs_free(&cfe->xattrs);
1033 kfree(cfe);
1034 }
1035 iput(inode);
1036} 977}
1037 978
1038static void cgroup_put(struct cgroup *cgrp) 979static void cgroup_put(struct cgroup *cgrp)
1039{ 980{
1040 dput(cgrp->dentry); 981 if (!atomic_dec_and_test(&cgrp->refcnt))
1041} 982 return;
983 if (WARN_ON_ONCE(!cgroup_is_dead(cgrp)))
984 return;
1042 985
1043static void remove_dir(struct dentry *d) 986 /*
1044{ 987 * XXX: cgrp->id is only used to look up css's. As cgroup and
1045 struct dentry *parent = dget(d->d_parent); 988 * css's lifetimes will be decoupled, it should be made
989 * per-subsystem and moved to css->id so that lookups are
990 * successful until the target css is released.
991 */
992 mutex_lock(&cgroup_mutex);
993 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
994 mutex_unlock(&cgroup_mutex);
995 cgrp->id = -1;
1046 996
1047 d_delete(d); 997 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1048 simple_rmdir(parent->d_inode, d);
1049 dput(parent);
1050} 998}
1051 999
1052static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1000static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1053{ 1001{
1054 struct cfent *cfe; 1002 char name[CGROUP_FILE_NAME_MAX];
1055 1003
1056 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
1057 lockdep_assert_held(&cgroup_tree_mutex); 1004 lockdep_assert_held(&cgroup_tree_mutex);
1058 1005 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1059 /*
1060 * If we're doing cleanup due to failure of cgroup_create(),
1061 * the corresponding @cfe may not exist.
1062 */
1063 list_for_each_entry(cfe, &cgrp->files, node) {
1064 struct dentry *d = cfe->dentry;
1065
1066 if (cft && cfe->type != cft)
1067 continue;
1068
1069 dget(d);
1070 d_delete(d);
1071 simple_unlink(cgrp->dentry->d_inode, d);
1072 list_del_init(&cfe->node);
1073 dput(d);
1074
1075 break;
1076 }
1077} 1006}
1078 1007
1079/** 1008/**
@@ -1096,22 +1025,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
1096 } 1025 }
1097} 1026}
1098 1027
1099/*
1100 * NOTE : the dentry must have been dget()'ed
1101 */
1102static void cgroup_d_remove_dir(struct dentry *dentry)
1103{
1104 struct dentry *parent;
1105
1106 parent = dentry->d_parent;
1107 spin_lock(&parent->d_lock);
1108 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1109 list_del_init(&dentry->d_u.d_child);
1110 spin_unlock(&dentry->d_lock);
1111 spin_unlock(&parent->d_lock);
1112 remove_dir(dentry);
1113}
1114
1115static int rebind_subsystems(struct cgroupfs_root *root, 1028static int rebind_subsystems(struct cgroupfs_root *root,
1116 unsigned long added_mask, unsigned removed_mask) 1029 unsigned long added_mask, unsigned removed_mask)
1117{ 1030{
@@ -1179,13 +1092,15 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1179 * now matches the bound subsystems. 1092 * now matches the bound subsystems.
1180 */ 1093 */
1181 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1094 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1095 kernfs_activate(cgrp->kn);
1182 1096
1183 return 0; 1097 return 0;
1184} 1098}
1185 1099
1186static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1100static int cgroup_show_options(struct seq_file *seq,
1101 struct kernfs_root *kf_root)
1187{ 1102{
1188 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1103 struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
1189 struct cgroup_subsys *ss; 1104 struct cgroup_subsys *ss;
1190 int ssid; 1105 int ssid;
1191 1106
@@ -1219,9 +1134,6 @@ struct cgroup_sb_opts {
1219 char *name; 1134 char *name;
1220 /* User explicitly requested empty subsystem */ 1135 /* User explicitly requested empty subsystem */
1221 bool none; 1136 bool none;
1222
1223 struct cgroupfs_root *new_root;
1224
1225}; 1137};
1226 1138
1227/* 1139/*
@@ -1380,11 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1380 return 0; 1292 return 0;
1381} 1293}
1382 1294
1383static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1295static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1384{ 1296{
1385 int ret = 0; 1297 int ret = 0;
1386 struct cgroupfs_root *root = sb->s_fs_info; 1298 struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
1387 struct cgroup *cgrp = &root->top_cgroup;
1388 struct cgroup_sb_opts opts; 1299 struct cgroup_sb_opts opts;
1389 unsigned long added_mask, removed_mask; 1300 unsigned long added_mask, removed_mask;
1390 1301
@@ -1393,7 +1304,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1393 return -EINVAL; 1304 return -EINVAL;
1394 } 1305 }
1395 1306
1396 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1397 mutex_lock(&cgroup_tree_mutex); 1307 mutex_lock(&cgroup_tree_mutex);
1398 mutex_lock(&cgroup_mutex); 1308 mutex_lock(&cgroup_mutex);
1399 1309
@@ -1439,34 +1349,26 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1439 kfree(opts.name); 1349 kfree(opts.name);
1440 mutex_unlock(&cgroup_mutex); 1350 mutex_unlock(&cgroup_mutex);
1441 mutex_unlock(&cgroup_tree_mutex); 1351 mutex_unlock(&cgroup_tree_mutex);
1442 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1443 return ret; 1352 return ret;
1444} 1353}
1445 1354
1446static const struct super_operations cgroup_ops = {
1447 .statfs = simple_statfs,
1448 .drop_inode = generic_delete_inode,
1449 .show_options = cgroup_show_options,
1450 .remount_fs = cgroup_remount,
1451};
1452
1453static void init_cgroup_housekeeping(struct cgroup *cgrp) 1355static void init_cgroup_housekeeping(struct cgroup *cgrp)
1454{ 1356{
1357 atomic_set(&cgrp->refcnt, 1);
1455 INIT_LIST_HEAD(&cgrp->sibling); 1358 INIT_LIST_HEAD(&cgrp->sibling);
1456 INIT_LIST_HEAD(&cgrp->children); 1359 INIT_LIST_HEAD(&cgrp->children);
1457 INIT_LIST_HEAD(&cgrp->files);
1458 INIT_LIST_HEAD(&cgrp->cset_links); 1360 INIT_LIST_HEAD(&cgrp->cset_links);
1459 INIT_LIST_HEAD(&cgrp->release_list); 1361 INIT_LIST_HEAD(&cgrp->release_list);
1460 INIT_LIST_HEAD(&cgrp->pidlists); 1362 INIT_LIST_HEAD(&cgrp->pidlists);
1461 mutex_init(&cgrp->pidlist_mutex); 1363 mutex_init(&cgrp->pidlist_mutex);
1462 cgrp->dummy_css.cgroup = cgrp; 1364 cgrp->dummy_css.cgroup = cgrp;
1463 simple_xattrs_init(&cgrp->xattrs);
1464} 1365}
1465 1366
1466static void init_cgroup_root(struct cgroupfs_root *root) 1367static void init_cgroup_root(struct cgroupfs_root *root)
1467{ 1368{
1468 struct cgroup *cgrp = &root->top_cgroup; 1369 struct cgroup *cgrp = &root->top_cgroup;
1469 1370
1371 atomic_set(&root->refcnt, 1);
1470 INIT_LIST_HEAD(&root->root_list); 1372 INIT_LIST_HEAD(&root->root_list);
1471 root->number_of_cgroups = 1; 1373 root->number_of_cgroups = 1;
1472 cgrp->root = root; 1374 cgrp->root = root;
@@ -1475,32 +1377,12 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1475 idr_init(&root->cgroup_idr); 1377 idr_init(&root->cgroup_idr);
1476} 1378}
1477 1379
1478static int cgroup_test_super(struct super_block *sb, void *data)
1479{
1480 struct cgroup_sb_opts *opts = data;
1481 struct cgroupfs_root *root = sb->s_fs_info;
1482
1483 /* If we asked for a name then it must match */
1484 if (opts->name && strcmp(opts->name, root->name))
1485 return 0;
1486
1487 /*
1488 * If we asked for subsystems (or explicitly for no
1489 * subsystems) then they must match
1490 */
1491 if ((opts->subsys_mask || opts->none)
1492 && (opts->subsys_mask != root->subsys_mask))
1493 return 0;
1494
1495 return 1;
1496}
1497
1498static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) 1380static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1499{ 1381{
1500 struct cgroupfs_root *root; 1382 struct cgroupfs_root *root;
1501 1383
1502 if (!opts->subsys_mask && !opts->none) 1384 if (!opts->subsys_mask && !opts->none)
1503 return NULL; 1385 return ERR_PTR(-EINVAL);
1504 1386
1505 root = kzalloc(sizeof(*root), GFP_KERNEL); 1387 root = kzalloc(sizeof(*root), GFP_KERNEL);
1506 if (!root) 1388 if (!root)
@@ -1527,99 +1409,21 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1527 return root; 1409 return root;
1528} 1410}
1529 1411
1530static int cgroup_set_super(struct super_block *sb, void *data)
1531{
1532 int ret;
1533 struct cgroup_sb_opts *opts = data;
1534
1535 /* If we don't have a new root, we can't set up a new sb */
1536 if (!opts->new_root)
1537 return -EINVAL;
1538
1539 BUG_ON(!opts->subsys_mask && !opts->none);
1540
1541 ret = set_anon_super(sb, NULL);
1542 if (ret)
1543 return ret;
1544
1545 sb->s_fs_info = opts->new_root;
1546 opts->new_root->sb = sb;
1547
1548 sb->s_blocksize = PAGE_CACHE_SIZE;
1549 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1550 sb->s_magic = CGROUP_SUPER_MAGIC;
1551 sb->s_op = &cgroup_ops;
1552
1553 return 0;
1554}
1555
1556static int cgroup_get_rootdir(struct super_block *sb)
1557{
1558 static const struct dentry_operations cgroup_dops = {
1559 .d_iput = cgroup_diput,
1560 .d_delete = always_delete_dentry,
1561 };
1562
1563 struct inode *inode =
1564 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1565
1566 if (!inode)
1567 return -ENOMEM;
1568
1569 inode->i_fop = &simple_dir_operations;
1570 inode->i_op = &cgroup_dir_inode_operations;
1571 /* directories start off with i_nlink == 2 (for "." entry) */
1572 inc_nlink(inode);
1573 sb->s_root = d_make_root(inode);
1574 if (!sb->s_root)
1575 return -ENOMEM;
1576 /* for everything else we want ->d_op set */
1577 sb->s_d_op = &cgroup_dops;
1578 return 0;
1579}
1580
1581static int cgroup_setup_root(struct cgroupfs_root *root) 1412static int cgroup_setup_root(struct cgroupfs_root *root)
1582{ 1413{
1583 LIST_HEAD(tmp_links); 1414 LIST_HEAD(tmp_links);
1584 struct super_block *sb = root->sb;
1585 struct cgroup *root_cgrp = &root->top_cgroup; 1415 struct cgroup *root_cgrp = &root->top_cgroup;
1586 struct cgroupfs_root *existing_root;
1587 struct css_set *cset; 1416 struct css_set *cset;
1588 struct inode *inode;
1589 const struct cred *cred;
1590 int i, ret; 1417 int i, ret;
1591 1418
1592 lockdep_assert_held(&cgroup_tree_mutex); 1419 lockdep_assert_held(&cgroup_tree_mutex);
1593 lockdep_assert_held(&cgroup_mutex); 1420 lockdep_assert_held(&cgroup_mutex);
1594 BUG_ON(sb->s_root != NULL);
1595
1596 mutex_unlock(&cgroup_mutex);
1597 mutex_unlock(&cgroup_tree_mutex);
1598
1599 ret = cgroup_get_rootdir(sb);
1600 if (ret) {
1601 mutex_lock(&cgroup_tree_mutex);
1602 mutex_lock(&cgroup_mutex);
1603 return ret;
1604 }
1605 inode = sb->s_root->d_inode;
1606
1607 mutex_lock(&inode->i_mutex);
1608 mutex_lock(&cgroup_tree_mutex);
1609 mutex_lock(&cgroup_mutex);
1610 1421
1611 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1422 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1612 if (ret < 0) 1423 if (ret < 0)
1613 goto out_unlock; 1424 goto out;
1614 root_cgrp->id = ret; 1425 root_cgrp->id = ret;
1615 1426
1616 /* check for name clashes with existing mounts */
1617 ret = -EBUSY;
1618 if (strlen(root->name))
1619 for_each_active_root(existing_root)
1620 if (!strcmp(existing_root->name, root->name))
1621 goto out_unlock;
1622
1623 /* 1427 /*
1624 * We're accessing css_set_count without locking css_set_lock here, 1428 * We're accessing css_set_count without locking css_set_lock here,
1625 * but that's OK - it can only be increased by someone holding 1429 * but that's OK - it can only be increased by someone holding
@@ -1628,34 +1432,29 @@ static int cgroup_setup_root(struct cgroupfs_root *root)
1628 */ 1432 */
1629 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1433 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1630 if (ret) 1434 if (ret)
1631 goto out_unlock; 1435 goto out;
1632 1436
1633 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ 1437 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1634 ret = cgroup_init_root_id(root, 2, 0); 1438 ret = cgroup_init_root_id(root, 2, 0);
1635 if (ret) 1439 if (ret)
1636 goto out_unlock; 1440 goto out;
1637
1638 sb->s_root->d_fsdata = root_cgrp;
1639 root_cgrp->dentry = sb->s_root;
1640 1441
1641 /* 1442 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1642 * We're inside get_sb() and will call lookup_one_len() to create 1443 KERNFS_ROOT_CREATE_DEACTIVATED,
1643 * the root files, which doesn't work if SELinux is in use. The 1444 root_cgrp);
1644 * following cred dancing somehow works around it. See 2ce9738ba 1445 if (IS_ERR(root->kf_root)) {
1645 * ("cgroupfs: use init_cred when populating new cgroupfs mount") 1446 ret = PTR_ERR(root->kf_root);
1646 * for more details. 1447 goto exit_root_id;
1647 */ 1448 }
1648 cred = override_creds(&init_cred); 1449 root_cgrp->kn = root->kf_root->kn;
1649 1450
1650 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1451 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1651 if (ret) 1452 if (ret)
1652 goto rm_base_files; 1453 goto destroy_root;
1653 1454
1654 ret = rebind_subsystems(root, root->subsys_mask, 0); 1455 ret = rebind_subsystems(root, root->subsys_mask, 0);
1655 if (ret) 1456 if (ret)
1656 goto rm_base_files; 1457 goto destroy_root;
1657
1658 revert_creds(cred);
1659 1458
1660 /* 1459 /*
1661 * There must be no failure case after here, since rebinding takes 1460 * There must be no failure case after here, since rebinding takes
@@ -1677,15 +1476,16 @@ static int cgroup_setup_root(struct cgroupfs_root *root)
1677 BUG_ON(!list_empty(&root_cgrp->children)); 1476 BUG_ON(!list_empty(&root_cgrp->children));
1678 BUG_ON(root->number_of_cgroups != 1); 1477 BUG_ON(root->number_of_cgroups != 1);
1679 1478
1479 kernfs_activate(root_cgrp->kn);
1680 ret = 0; 1480 ret = 0;
1681 goto out_unlock; 1481 goto out;
1682 1482
1683rm_base_files: 1483destroy_root:
1684 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); 1484 kernfs_destroy_root(root->kf_root);
1685 revert_creds(cred); 1485 root->kf_root = NULL;
1486exit_root_id:
1686 cgroup_exit_root_id(root); 1487 cgroup_exit_root_id(root);
1687out_unlock: 1488out:
1688 mutex_unlock(&inode->i_mutex);
1689 free_cgrp_cset_links(&tmp_links); 1489 free_cgrp_cset_links(&tmp_links);
1690 return ret; 1490 return ret;
1691} 1491}
@@ -1694,10 +1494,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1694 int flags, const char *unused_dev_name, 1494 int flags, const char *unused_dev_name,
1695 void *data) 1495 void *data)
1696{ 1496{
1697 struct super_block *sb = NULL; 1497 struct cgroupfs_root *root;
1698 struct cgroupfs_root *root = NULL;
1699 struct cgroup_sb_opts opts; 1498 struct cgroup_sb_opts opts;
1700 struct cgroupfs_root *new_root; 1499 struct dentry *dentry;
1701 int ret; 1500 int ret;
1702 1501
1703 mutex_lock(&cgroup_tree_mutex); 1502 mutex_lock(&cgroup_tree_mutex);
@@ -1708,41 +1507,32 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1708 if (ret) 1507 if (ret)
1709 goto out_unlock; 1508 goto out_unlock;
1710 1509
1711 /* 1510 /* look for a matching existing root */
1712 * Allocate a new cgroup root. We may not need it if we're 1511 for_each_active_root(root) {
1713 * reusing an existing hierarchy. 1512 bool name_match = false;
1714 */
1715 new_root = cgroup_root_from_opts(&opts);
1716 if (IS_ERR(new_root)) {
1717 ret = PTR_ERR(new_root);
1718 goto out_unlock;
1719 }
1720 opts.new_root = new_root;
1721 1513
1722 /* Locate an existing or new sb for this hierarchy */ 1514 /*
1723 mutex_unlock(&cgroup_mutex); 1515 * If we asked for a name then it must match. Also, if
1724 mutex_unlock(&cgroup_tree_mutex); 1516 * name matches but sybsys_mask doesn't, we should fail.
1725 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1517 * Remember whether name matched.
1726 mutex_lock(&cgroup_tree_mutex); 1518 */
1727 mutex_lock(&cgroup_mutex); 1519 if (opts.name) {
1728 if (IS_ERR(sb)) { 1520 if (strcmp(opts.name, root->name))
1729 ret = PTR_ERR(sb); 1521 continue;
1730 cgroup_free_root(opts.new_root); 1522 name_match = true;
1731 goto out_unlock; 1523 }
1732 }
1733 1524
1734 root = sb->s_fs_info;
1735 BUG_ON(!root);
1736 if (root == opts.new_root) {
1737 ret = cgroup_setup_root(root);
1738 if (ret)
1739 goto out_unlock;
1740 } else {
1741 /* 1525 /*
1742 * We re-used an existing hierarchy - the new root (if 1526 * If we asked for subsystems (or explicitly for no
1743 * any) is not needed 1527 * subsystems) then they must match.
1744 */ 1528 */
1745 cgroup_free_root(opts.new_root); 1529 if ((opts.subsys_mask || opts.none) &&
1530 (opts.subsys_mask != root->subsys_mask)) {
1531 if (!name_match)
1532 continue;
1533 ret = -EBUSY;
1534 goto out_unlock;
1535 }
1746 1536
1747 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1537 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1748 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1538 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
@@ -1753,23 +1543,45 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1753 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1543 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1754 } 1544 }
1755 } 1545 }
1546
1547 cgroup_get_root(root);
1548 goto out_unlock;
1756 } 1549 }
1757 1550
1758 ret = 0; 1551 /* no such thing, create a new one */
1552 root = cgroup_root_from_opts(&opts);
1553 if (IS_ERR(root)) {
1554 ret = PTR_ERR(root);
1555 goto out_unlock;
1556 }
1557
1558 ret = cgroup_setup_root(root);
1559 if (ret)
1560 cgroup_free_root(root);
1561
1759out_unlock: 1562out_unlock:
1760 mutex_unlock(&cgroup_mutex); 1563 mutex_unlock(&cgroup_mutex);
1761 mutex_unlock(&cgroup_tree_mutex); 1564 mutex_unlock(&cgroup_tree_mutex);
1762 1565
1763 if (ret && !IS_ERR_OR_NULL(sb))
1764 deactivate_locked_super(sb);
1765
1766 kfree(opts.release_agent); 1566 kfree(opts.release_agent);
1767 kfree(opts.name); 1567 kfree(opts.name);
1768 1568
1769 if (!ret) 1569 if (ret)
1770 return dget(sb->s_root);
1771 else
1772 return ERR_PTR(ret); 1570 return ERR_PTR(ret);
1571
1572 dentry = kernfs_mount(fs_type, flags, root->kf_root);
1573 if (IS_ERR(dentry))
1574 cgroup_put_root(root);
1575 return dentry;
1576}
1577
1578static void cgroup_kill_sb(struct super_block *sb)
1579{
1580 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1581 struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
1582
1583 cgroup_put_root(root);
1584 kernfs_kill_sb(sb);
1773} 1585}
1774 1586
1775static struct file_system_type cgroup_fs_type = { 1587static struct file_system_type cgroup_fs_type = {
@@ -2301,29 +2113,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2301 return 0; 2113 return 0;
2302} 2114}
2303 2115
2304static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, 2116static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2305 size_t nbytes, loff_t *ppos) 2117 size_t nbytes, loff_t off)
2306{ 2118{
2307 struct cfent *cfe = __d_cfe(file->f_dentry); 2119 struct cgroup *cgrp = of->kn->parent->priv;
2308 struct cftype *cft = __d_cft(file->f_dentry); 2120 struct cftype *cft = of->kn->priv;
2309 struct cgroup_subsys_state *css = cfe->css; 2121 struct cgroup_subsys_state *css;
2310 size_t max_bytes = max(cft->max_write_len, PAGE_SIZE);
2311 char *buf;
2312 int ret; 2122 int ret;
2313 2123
2314 if (nbytes > max_bytes) 2124 /*
2315 return -E2BIG; 2125 * kernfs guarantees that a file isn't deleted with operations in
2316 2126 * flight, which means that the matching css is and stays alive and
2317 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2127 * doesn't need to be pinned. The RCU locking is not necessary
2318 if (!buf) 2128 * either. It's just for the convenience of using cgroup_css().
2319 return -ENOMEM; 2129 */
2320 2130 rcu_read_lock();
2321 if (copy_from_user(buf, userbuf, nbytes)) { 2131 css = cgroup_css(cgrp, cft->ss);
2322 ret = -EFAULT; 2132 rcu_read_unlock();
2323 goto out_free;
2324 }
2325
2326 buf[nbytes] = '\0';
2327 2133
2328 if (cft->write_string) { 2134 if (cft->write_string) {
2329 ret = cft->write_string(css, cft, strstrip(buf)); 2135 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2342,53 +2148,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2342 } else { 2148 } else {
2343 ret = -EINVAL; 2149 ret = -EINVAL;
2344 } 2150 }
2345out_free: 2151
2346 kfree(buf);
2347 return ret ?: nbytes; 2152 return ret ?: nbytes;
2348} 2153}
2349 2154
2350/*
2351 * seqfile ops/methods for returning structured data. Currently just
2352 * supports string->u64 maps, but can be extended in future.
2353 */
2354
2355static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2155static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2356{ 2156{
2357 struct cftype *cft = seq_cft(seq); 2157 return seq_cft(seq)->seq_start(seq, ppos);
2358
2359 if (cft->seq_start) {
2360 return cft->seq_start(seq, ppos);
2361 } else {
2362 /*
2363 * The same behavior and code as single_open(). Returns
2364 * !NULL if pos is at the beginning; otherwise, NULL.
2365 */
2366 return NULL + !*ppos;
2367 }
2368} 2158}
2369 2159
2370static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2160static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2371{ 2161{
2372 struct cftype *cft = seq_cft(seq); 2162 return seq_cft(seq)->seq_next(seq, v, ppos);
2373
2374 if (cft->seq_next) {
2375 return cft->seq_next(seq, v, ppos);
2376 } else {
2377 /*
2378 * The same behavior and code as single_open(), always
2379 * terminate after the initial read.
2380 */
2381 ++*ppos;
2382 return NULL;
2383 }
2384} 2163}
2385 2164
2386static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2165static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2387{ 2166{
2388 struct cftype *cft = seq_cft(seq); 2167 seq_cft(seq)->seq_stop(seq, v);
2389
2390 if (cft->seq_stop)
2391 cft->seq_stop(seq, v);
2392} 2168}
2393 2169
2394static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2170static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2408,96 +2184,36 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2408 return 0; 2184 return 0;
2409} 2185}
2410 2186
2411static struct seq_operations cgroup_seq_operations = { 2187static struct kernfs_ops cgroup_kf_single_ops = {
2412 .start = cgroup_seqfile_start, 2188 .atomic_write_len = PAGE_SIZE,
2413 .next = cgroup_seqfile_next, 2189 .write = cgroup_file_write,
2414 .stop = cgroup_seqfile_stop, 2190 .seq_show = cgroup_seqfile_show,
2415 .show = cgroup_seqfile_show,
2416}; 2191};
2417 2192
2418static int cgroup_file_open(struct inode *inode, struct file *file) 2193static struct kernfs_ops cgroup_kf_ops = {
2419{ 2194 .atomic_write_len = PAGE_SIZE,
2420 struct cfent *cfe = __d_cfe(file->f_dentry); 2195 .write = cgroup_file_write,
2421 struct cftype *cft = __d_cft(file->f_dentry); 2196 .seq_start = cgroup_seqfile_start,
2422 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2197 .seq_next = cgroup_seqfile_next,
2423 struct cgroup_subsys_state *css; 2198 .seq_stop = cgroup_seqfile_stop,
2424 struct cgroup_open_file *of; 2199 .seq_show = cgroup_seqfile_show,
2425 int err; 2200};
2426
2427 err = generic_file_open(inode, file);
2428 if (err)
2429 return err;
2430
2431 /*
2432 * If the file belongs to a subsystem, pin the css. Will be
2433 * unpinned either on open failure or release. This ensures that
2434 * @css stays alive for all file operations.
2435 */
2436 rcu_read_lock();
2437 css = cgroup_css(cgrp, cft->ss);
2438 if (cft->ss && !css_tryget(css))
2439 css = NULL;
2440 rcu_read_unlock();
2441
2442 if (!css)
2443 return -ENODEV;
2444
2445 /*
2446 * @cfe->css is used by read/write/close to determine the
2447 * associated css. @file->private_data would be a better place but
2448 * that's already used by seqfile. Multiple accessors may use it
2449 * simultaneously which is okay as the association never changes.
2450 */
2451 WARN_ON_ONCE(cfe->css && cfe->css != css);
2452 cfe->css = css;
2453
2454 of = __seq_open_private(file, &cgroup_seq_operations,
2455 sizeof(struct cgroup_open_file));
2456 if (of) {
2457 of->cfe = cfe;
2458 return 0;
2459 }
2460
2461 if (css->ss)
2462 css_put(css);
2463 return -ENOMEM;
2464}
2465
2466static int cgroup_file_release(struct inode *inode, struct file *file)
2467{
2468 struct cfent *cfe = __d_cfe(file->f_dentry);
2469 struct cgroup_subsys_state *css = cfe->css;
2470
2471 if (css->ss)
2472 css_put(css);
2473 return seq_release_private(inode, file);
2474}
2475 2201
2476/* 2202/*
2477 * cgroup_rename - Only allow simple rename of directories in place. 2203 * cgroup_rename - Only allow simple rename of directories in place.
2478 */ 2204 */
2479static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2205static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2480 struct inode *new_dir, struct dentry *new_dentry) 2206 const char *new_name_str)
2481{ 2207{
2482 int ret; 2208 struct cgroup *cgrp = kn->priv;
2483 struct cgroup_name *name, *old_name; 2209 struct cgroup_name *name, *old_name;
2484 struct cgroup *cgrp; 2210 int ret;
2485
2486 /*
2487 * It's convinient to use parent dir's i_mutex to protected
2488 * cgrp->name.
2489 */
2490 lockdep_assert_held(&old_dir->i_mutex);
2491 2211
2492 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2212 if (kernfs_type(kn) != KERNFS_DIR)
2493 return -ENOTDIR; 2213 return -ENOTDIR;
2494 if (new_dentry->d_inode) 2214 if (kn->parent != new_parent)
2495 return -EEXIST;
2496 if (old_dir != new_dir)
2497 return -EIO; 2215 return -EIO;
2498 2216
2499 cgrp = __d_cgrp(old_dentry);
2500
2501 /* 2217 /*
2502 * This isn't a proper migration and its usefulness is very 2218 * This isn't a proper migration and its usefulness is very
2503 * limited. Disallow if sane_behavior. 2219 * limited. Disallow if sane_behavior.
@@ -2505,186 +2221,43 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2505 if (cgroup_sane_behavior(cgrp)) 2221 if (cgroup_sane_behavior(cgrp))
2506 return -EPERM; 2222 return -EPERM;
2507 2223
2508 name = cgroup_alloc_name(new_dentry->d_name.name); 2224 name = cgroup_alloc_name(new_name_str);
2509 if (!name) 2225 if (!name)
2510 return -ENOMEM; 2226 return -ENOMEM;
2511 2227
2512 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2228 mutex_lock(&cgroup_tree_mutex);
2513 if (ret) { 2229 mutex_lock(&cgroup_mutex);
2514 kfree(name); 2230
2515 return ret; 2231 ret = kernfs_rename(kn, new_parent, new_name_str);
2232 if (!ret) {
2233 old_name = rcu_dereference_protected(cgrp->name, true);
2234 rcu_assign_pointer(cgrp->name, name);
2235 } else {
2236 old_name = name;
2516 } 2237 }
2517 2238
2518 old_name = rcu_dereference_protected(cgrp->name, true); 2239 mutex_unlock(&cgroup_mutex);
2519 rcu_assign_pointer(cgrp->name, name); 2240 mutex_unlock(&cgroup_tree_mutex);
2520 2241
2521 kfree_rcu(old_name, rcu_head); 2242 kfree_rcu(old_name, rcu_head);
2522 return 0; 2243 return ret;
2523}
2524
2525static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2526{
2527 if (S_ISDIR(dentry->d_inode->i_mode))
2528 return &__d_cgrp(dentry)->xattrs;
2529 else
2530 return &__d_cfe(dentry)->xattrs;
2531}
2532
2533static inline int xattr_enabled(struct dentry *dentry)
2534{
2535 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2536 return root->flags & CGRP_ROOT_XATTR;
2537}
2538
2539static bool is_valid_xattr(const char *name)
2540{
2541 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2542 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2543 return true;
2544 return false;
2545}
2546
2547static int cgroup_setxattr(struct dentry *dentry, const char *name,
2548 const void *val, size_t size, int flags)
2549{
2550 if (!xattr_enabled(dentry))
2551 return -EOPNOTSUPP;
2552 if (!is_valid_xattr(name))
2553 return -EINVAL;
2554 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2555}
2556
2557static int cgroup_removexattr(struct dentry *dentry, const char *name)
2558{
2559 if (!xattr_enabled(dentry))
2560 return -EOPNOTSUPP;
2561 if (!is_valid_xattr(name))
2562 return -EINVAL;
2563 return simple_xattr_remove(__d_xattrs(dentry), name);
2564}
2565
2566static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2567 void *buf, size_t size)
2568{
2569 if (!xattr_enabled(dentry))
2570 return -EOPNOTSUPP;
2571 if (!is_valid_xattr(name))
2572 return -EINVAL;
2573 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2574}
2575
2576static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2577{
2578 if (!xattr_enabled(dentry))
2579 return -EOPNOTSUPP;
2580 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2581}
2582
2583static const struct file_operations cgroup_file_operations = {
2584 .read = seq_read,
2585 .write = cgroup_file_write,
2586 .llseek = generic_file_llseek,
2587 .open = cgroup_file_open,
2588 .release = cgroup_file_release,
2589};
2590
2591static const struct inode_operations cgroup_file_inode_operations = {
2592 .setxattr = cgroup_setxattr,
2593 .getxattr = cgroup_getxattr,
2594 .listxattr = cgroup_listxattr,
2595 .removexattr = cgroup_removexattr,
2596};
2597
2598static const struct inode_operations cgroup_dir_inode_operations = {
2599 .lookup = simple_lookup,
2600 .mkdir = cgroup_mkdir,
2601 .rmdir = cgroup_rmdir,
2602 .rename = cgroup_rename,
2603 .setxattr = cgroup_setxattr,
2604 .getxattr = cgroup_getxattr,
2605 .listxattr = cgroup_listxattr,
2606 .removexattr = cgroup_removexattr,
2607};
2608
2609static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2610 struct super_block *sb)
2611{
2612 struct inode *inode;
2613
2614 if (!dentry)
2615 return -ENOENT;
2616 if (dentry->d_inode)
2617 return -EEXIST;
2618
2619 inode = cgroup_new_inode(mode, sb);
2620 if (!inode)
2621 return -ENOMEM;
2622
2623 if (S_ISDIR(mode)) {
2624 inode->i_op = &cgroup_dir_inode_operations;
2625 inode->i_fop = &simple_dir_operations;
2626
2627 /* start off with i_nlink == 2 (for "." entry) */
2628 inc_nlink(inode);
2629 inc_nlink(dentry->d_parent->d_inode);
2630
2631 /*
2632 * Control reaches here with cgroup_mutex held.
2633 * @inode->i_mutex should nest outside cgroup_mutex but we
2634 * want to populate it immediately without releasing
2635 * cgroup_mutex. As @inode isn't visible to anyone else
2636 * yet, trylock will always succeed without affecting
2637 * lockdep checks.
2638 */
2639 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2640 } else if (S_ISREG(mode)) {
2641 inode->i_size = 0;
2642 inode->i_fop = &cgroup_file_operations;
2643 inode->i_op = &cgroup_file_inode_operations;
2644 }
2645 d_instantiate(dentry, inode);
2646 dget(dentry); /* Extra count - pin the dentry in core */
2647 return 0;
2648} 2244}
2649 2245
2650static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2246static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2651{ 2247{
2652 struct dentry *dir = cgrp->dentry;
2653 struct cgroup *parent = __d_cgrp(dir);
2654 struct dentry *dentry;
2655 struct cfent *cfe;
2656 int error;
2657 umode_t mode;
2658 char name[CGROUP_FILE_NAME_MAX]; 2248 char name[CGROUP_FILE_NAME_MAX];
2249 struct kernfs_node *kn;
2250 struct lock_class_key *key = NULL;
2659 2251
2660 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2252#ifdef CONFIG_DEBUG_LOCK_ALLOC
2661 2253 key = &cft->lockdep_key;
2662 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); 2254#endif
2663 if (!cfe) 2255 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2664 return -ENOMEM; 2256 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2665 2257 NULL, false, key);
2666 cgroup_file_name(cgrp, cft, name); 2258 if (IS_ERR(kn))
2667 dentry = lookup_one_len(name, dir, strlen(name)); 2259 return PTR_ERR(kn);
2668 if (IS_ERR(dentry)) { 2260 return 0;
2669 error = PTR_ERR(dentry);
2670 goto out;
2671 }
2672
2673 cfe->type = (void *)cft;
2674 cfe->dentry = dentry;
2675 dentry->d_fsdata = cfe;
2676 simple_xattrs_init(&cfe->xattrs);
2677
2678 mode = cgroup_file_mode(cft);
2679 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2680 if (!error) {
2681 list_add_tail(&cfe->node, &parent->files);
2682 cfe = NULL;
2683 }
2684 dput(dentry);
2685out:
2686 kfree(cfe);
2687 return error;
2688} 2261}
2689 2262
2690/** 2263/**
@@ -2704,7 +2277,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2704 struct cftype *cft; 2277 struct cftype *cft;
2705 int ret; 2278 int ret;
2706 2279
2707 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2708 lockdep_assert_held(&cgroup_tree_mutex); 2280 lockdep_assert_held(&cgroup_tree_mutex);
2709 2281
2710 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2282 for (cft = cfts; cft->name[0] != '\0'; cft++) {
@@ -2749,9 +2321,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2749 LIST_HEAD(pending); 2321 LIST_HEAD(pending);
2750 struct cgroup_subsys *ss = cfts[0].ss; 2322 struct cgroup_subsys *ss = cfts[0].ss;
2751 struct cgroup *root = &ss->root->top_cgroup; 2323 struct cgroup *root = &ss->root->top_cgroup;
2752 struct super_block *sb = ss->root->sb;
2753 struct cgroup *prev = NULL; 2324 struct cgroup *prev = NULL;
2754 struct inode *inode;
2755 struct cgroup_subsys_state *css; 2325 struct cgroup_subsys_state *css;
2756 u64 update_before; 2326 u64 update_before;
2757 int ret = 0; 2327 int ret = 0;
@@ -2759,12 +2329,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2759 mutex_unlock(&cgroup_mutex); 2329 mutex_unlock(&cgroup_mutex);
2760 2330
2761 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2331 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2762 if (!cfts || ss->root == &cgroup_dummy_root || 2332 if (!cfts || ss->root == &cgroup_dummy_root) {
2763 !atomic_inc_not_zero(&sb->s_active)) {
2764 mutex_unlock(&cgroup_tree_mutex); 2333 mutex_unlock(&cgroup_tree_mutex);
2765 return 0; 2334 return 0;
2766 } 2335 }
2767 2336
2337 cgroup_get_root(ss->root);
2338
2768 /* 2339 /*
2769 * All cgroups which are created after we drop cgroup_mutex will 2340 * All cgroups which are created after we drop cgroup_mutex will
2770 * have the updated set of files, so we only need to update the 2341 * have the updated set of files, so we only need to update the
@@ -2779,18 +2350,16 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2779 if (cgroup_is_dead(cgrp)) 2350 if (cgroup_is_dead(cgrp))
2780 continue; 2351 continue;
2781 2352
2782 inode = cgrp->dentry->d_inode;
2783 cgroup_get(cgrp); 2353 cgroup_get(cgrp);
2784 if (prev) 2354 if (prev)
2785 cgroup_put(prev); 2355 cgroup_put(prev);
2786 prev = cgrp; 2356 prev = cgrp;
2787 2357
2788 mutex_unlock(&cgroup_tree_mutex); 2358 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) {
2789 mutex_lock(&inode->i_mutex);
2790 mutex_lock(&cgroup_tree_mutex);
2791 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2792 ret = cgroup_addrm_files(cgrp, cfts, is_add); 2359 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2793 mutex_unlock(&inode->i_mutex); 2360 if (is_add)
2361 kernfs_activate(cgrp->kn);
2362 }
2794 if (ret) 2363 if (ret)
2795 break; 2364 break;
2796 } 2365 }
@@ -2804,16 +2373,45 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
2804{ 2373{
2805 struct cftype *cft; 2374 struct cftype *cft;
2806 2375
2807 for (cft = cfts; cft->name[0] != '\0'; cft++) 2376 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2377 /* free copy for custom atomic_write_len, see init_cftypes() */
2378 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2379 kfree(cft->kf_ops);
2380 cft->kf_ops = NULL;
2808 cft->ss = NULL; 2381 cft->ss = NULL;
2382 }
2809} 2383}
2810 2384
2811static void cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2385static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2812{ 2386{
2813 struct cftype *cft; 2387 struct cftype *cft;
2814 2388
2815 for (cft = cfts; cft->name[0] != '\0'; cft++) 2389 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2390 struct kernfs_ops *kf_ops;
2391
2392 if (cft->seq_start)
2393 kf_ops = &cgroup_kf_ops;
2394 else
2395 kf_ops = &cgroup_kf_single_ops;
2396
2397 /*
2398 * Ugh... if @cft wants a custom max_write_len, we need to
2399 * make a copy of kf_ops to set its atomic_write_len.
2400 */
2401 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2402 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2403 if (!kf_ops) {
2404 cgroup_exit_cftypes(cfts);
2405 return -ENOMEM;
2406 }
2407 kf_ops->atomic_write_len = cft->max_write_len;
2408 }
2409
2410 cft->kf_ops = kf_ops;
2816 cft->ss = ss; 2411 cft->ss = ss;
2412 }
2413
2414 return 0;
2817} 2415}
2818 2416
2819/** 2417/**
@@ -2839,7 +2437,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2839 if (!set) 2437 if (!set)
2840 return -ENOMEM; 2438 return -ENOMEM;
2841 2439
2842 cgroup_init_cftypes(ss, cfts); 2440 ret = cgroup_init_cftypes(ss, cfts);
2441 if (ret)
2442 return ret;
2843 2443
2844 cgroup_cfts_prepare(); 2444 cgroup_cfts_prepare();
2845 set->cfts = cfts; 2445 set->cfts = cfts;
@@ -3706,21 +3306,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3706 */ 3306 */
3707int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3307int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3708{ 3308{
3709 int ret = -EINVAL; 3309 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3710 struct cgroup *cgrp; 3310 struct cgroup *cgrp;
3711 struct css_task_iter it; 3311 struct css_task_iter it;
3712 struct task_struct *tsk; 3312 struct task_struct *tsk;
3713 3313
3314 /* it should be kernfs_node belonging to cgroupfs and is a directory */
3315 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3316 kernfs_type(kn) != KERNFS_DIR)
3317 return -EINVAL;
3318
3714 /* 3319 /*
3715 * Validate dentry by checking the superblock operations, 3320 * We aren't being called from kernfs and there's no guarantee on
3716 * and make sure it's a directory. 3321 * @kn->priv's validity. For this and css_tryget_from_dir(),
3322 * @kn->priv is RCU safe. Let's do the RCU dancing.
3717 */ 3323 */
3718 if (dentry->d_sb->s_op != &cgroup_ops || 3324 rcu_read_lock();
3719 !S_ISDIR(dentry->d_inode->i_mode)) 3325 cgrp = rcu_dereference(kn->priv);
3720 goto err; 3326 if (!cgrp) {
3721 3327 rcu_read_unlock();
3722 ret = 0; 3328 return -ENOENT;
3723 cgrp = dentry->d_fsdata; 3329 }
3724 3330
3725 css_task_iter_start(&cgrp->dummy_css, &it); 3331 css_task_iter_start(&cgrp->dummy_css, &it);
3726 while ((tsk = css_task_iter_next(&it))) { 3332 while ((tsk = css_task_iter_next(&it))) {
@@ -3745,8 +3351,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3745 } 3351 }
3746 css_task_iter_end(&it); 3352 css_task_iter_end(&it);
3747 3353
3748err: 3354 rcu_read_unlock();
3749 return ret; 3355 return 0;
3750} 3356}
3751 3357
3752 3358
@@ -3764,7 +3370,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3764 * after a seek to the start). Use a binary-search to find the 3370 * after a seek to the start). Use a binary-search to find the
3765 * next pid to display, if any 3371 * next pid to display, if any
3766 */ 3372 */
3767 struct cgroup_open_file *of = s->private; 3373 struct kernfs_open_file *of = s->private;
3768 struct cgroup *cgrp = seq_css(s)->cgroup; 3374 struct cgroup *cgrp = seq_css(s)->cgroup;
3769 struct cgroup_pidlist *l; 3375 struct cgroup_pidlist *l;
3770 enum cgroup_filetype type = seq_cft(s)->private; 3376 enum cgroup_filetype type = seq_cft(s)->private;
@@ -3819,7 +3425,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 
 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_open_file *of = s->private;
+	struct kernfs_open_file *of = s->private;
 	struct cgroup_pidlist *l = of->priv;
 
 	if (l)
@@ -3830,7 +3436,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_open_file *of = s->private;
+	struct kernfs_open_file *of = s->private;
 	struct cgroup_pidlist *l = of->priv;
 	pid_t *p = v;
 	pid_t *end = l->list + l->length;
@@ -3880,21 +3486,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-/*
- * When dput() is called asynchronously, if umount has been done and
- * then deactivate_super() in cgroup_free_fn() kills the superblock,
- * there's a small window that vfs will see the root dentry with non-zero
- * refcnt and trigger BUG().
- *
- * That's why we hold a reference before dput() and drop it right after.
- */
-static void cgroup_dput(struct cgroup *cgrp)
-{
-	cgroup_get_root(cgrp->root);
-	cgroup_put(cgrp);
-	cgroup_put_root(cgrp->root);
-}
-
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
 				      struct cftype *cft)
 {
@@ -4029,7 +3620,7 @@ static void css_free_work_fn(struct work_struct *work)
 	css_put(css->parent);
 
 	css->ss->css_free(css);
-	cgroup_dput(cgrp);
+	cgroup_put(cgrp);
 }
 
 static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4037,10 +3628,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
 	struct cgroup_subsys_state *css =
 		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
 
-	/*
-	 * css holds an extra ref to @cgrp->dentry which is put on the last
-	 * css_put().  dput() requires process context which we don't have.
-	 */
 	INIT_WORK(&css->destroy_work, css_free_work_fn);
 	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
@@ -4122,7 +3709,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	struct cgroup_subsys_state *css;
 	int err;
 
-	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4163,30 +3749,28 @@ err_free:
 	return err;
 }
 
-/*
+/**
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
+ * @name_str: name of the new cgroup
+ * @mode: mode to set on new cgroup
  */
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+static long cgroup_create(struct cgroup *parent, const char *name_str,
 			  umode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
 	int ssid, err;
 	struct cgroup_subsys *ss;
-	struct super_block *sb = root->sb;
+	struct kernfs_node *kn;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;
 
-	name = cgroup_alloc_name(dentry->d_name.name);
+	name = cgroup_alloc_name(name_str);
 	if (!name) {
 		err = -ENOMEM;
 		goto err_free_cgrp;
@@ -4217,18 +3801,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		goto err_unlock;
 	}
 
-	/* Grab a reference on the superblock so the hierarchy doesn't
-	 * get deleted on unmount if there are child cgroups.  This
-	 * can be done outside cgroup_mutex, since the sb can't
-	 * disappear while someone has an open control file on the
-	 * fs */
-	cgroup_get_root(root);
-
 	init_cgroup_housekeeping(cgrp);
 
-	dentry->d_fsdata = cgrp;
-	cgrp->dentry = dentry;
-
 	cgrp->parent = parent;
 	cgrp->dummy_css.parent = &parent->dummy_css;
 	cgrp->root = parent->root;
@@ -4239,15 +3813,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-	/*
-	 * Create directory.  cgroup_create_file() returns with the new
-	 * directory locked on success so that it can be populated without
-	 * dropping cgroup_mutex.
-	 */
-	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
-	if (err < 0)
+	/* create the directory */
+	kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp);
+	if (IS_ERR(kn)) {
+		err = PTR_ERR(kn);
 		goto err_free_id;
-	lockdep_assert_held(&dentry->d_inode->i_mutex);
+	}
+	cgrp->kn = kn;
 
 	cgrp->serial_nr = cgroup_serial_nr_next++;
 
@@ -4255,7 +3827,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
 
-	/* hold a ref to the parent's dentry */
+	/*
+	 * Grab a reference on the root and parent so that they don't get
+	 * deleted while there are child cgroups.
+	 */
+	cgroup_get_root(root);
 	cgroup_get(parent);
 
 	/*
@@ -4277,16 +3853,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
+	kernfs_activate(kn);
+
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
-	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 
 	return 0;
 
 err_free_id:
 	idr_remove(&root->cgroup_idr, cgrp->id);
-	/* Release the reference count that we took on the superblock */
-	cgroup_put_root(root);
 err_unlock:
 	mutex_unlock(&cgroup_mutex);
 err_unlock_tree:
@@ -4300,16 +3875,15 @@ err_destroy:
 	cgroup_destroy_locked(cgrp);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
-	mutex_unlock(&dentry->d_inode->i_mutex);
 	return err;
 }
 
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			umode_t mode)
 {
-	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+	struct cgroup *parent = parent_kn->priv;
 
-	/* the vfs holds inode->i_mutex already */
-	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+	return cgroup_create(parent, name, mode);
 }
 
 /*
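Under kernfs, mkdir no longer starts from a VFS dentry: the syscall op receives the parent kernfs_node plus the new name, and the directory only becomes visible to userland once kernfs_activate() is called after everything has been set up. A condensed, illustrative sketch of that flow (hypothetical function name; ID allocation, base files, locking and error unwinding elided):

static int example_mkdir(struct kernfs_node *parent_kn, const char *name,
			 umode_t mode)
{
	struct cgroup *parent = parent_kn->priv;
	struct cgroup *cgrp;
	struct kernfs_node *kn;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;
	cgrp->parent = parent;

	/* the new directory carries the new cgroup as its ->priv back-pointer */
	kn = kernfs_create_dir(parent_kn, name, mode, cgrp);
	if (IS_ERR(kn)) {
		kfree(cgrp);
		return PTR_ERR(kn);
	}
	cgrp->kn = kn;

	/* ... populate control files and link @cgrp under @parent ... */

	/* nothing is user-visible until the node is activated */
	kernfs_activate(kn);
	return 0;
}
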
@@ -4373,6 +3947,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
  */
 static void kill_css(struct cgroup_subsys_state *css)
 {
+	/*
+	 * This must happen before css is disassociated with its cgroup.
+	 * See seq_css() for details.
+	 */
 	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 
 	/*
@@ -4421,13 +3999,12 @@ static void kill_css(struct cgroup_subsys_state *css)
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
-	struct dentry *d = cgrp->dentry;
-	struct cgroup_subsys_state *css;
 	struct cgroup *child;
+	struct cgroup_subsys_state *css;
+	struct kernfs_node *kn;
 	bool empty;
 	int ssid;
 
-	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
@@ -4492,15 +4069,24 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	if (!cgrp->nr_css)
 		cgroup_destroy_css_killed(cgrp);
 
+	/* remove @cgrp directory along with the base files */
+	mutex_unlock(&cgroup_mutex);
+
 	/*
-	 * Clear the base files and remove @cgrp directory.  The removal
-	 * puts the base ref but we aren't quite done with @cgrp yet, so
-	 * hold onto it.
+	 * There are two control paths which try to determine cgroup from
+	 * dentry without going through kernfs - cgroupstats_build() and
+	 * css_tryget_from_dir().  Those are supported by RCU protecting
+	 * clearing of cgrp->kn->priv backpointer, which should happen
+	 * after all files under it have been removed.
 	 */
-	mutex_unlock(&cgroup_mutex);
-	cgroup_addrm_files(cgrp, cgroup_base_files, false);
-	dget(d);
-	cgroup_d_remove_dir(d);
+	kn = cgrp->kn;
+	kernfs_get(kn);
+
+	kernfs_remove(cgrp->kn);
+
+	RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+	kernfs_put(kn);
+
 	mutex_lock(&cgroup_mutex);
 
 	return 0;
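The removal above is what makes the RCU lookups safe: the directory and every file under it are taken down first, and only then is the kn->priv back-pointer cleared. A reduced sketch of just that ordering (hypothetical helper name; css teardown and cgroup_mutex handling elided):

static void example_remove_cgroup_dir(struct cgroup *cgrp)
{
	struct kernfs_node *kn = cgrp->kn;

	/* keep the kernfs node itself alive while we still touch it */
	kernfs_get(kn);

	/* drains in-flight operations and removes every file underneath */
	kernfs_remove(kn);

	/*
	 * Cleared last, under RCU: concurrent readers see either the old,
	 * still-valid cgroup or NULL.
	 */
	RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);

	kernfs_put(kn);
}
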
@@ -4531,19 +4117,46 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
 	check_for_release(parent);
 }
 
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+static int cgroup_rmdir(struct kernfs_node *kn)
 {
-	int ret;
+	struct cgroup *cgrp = kn->priv;
+	int ret = 0;
+
+	/*
+	 * This is self-destruction but @kn can't be removed while this
+	 * callback is in progress.  Let's break active protection.  Once
+	 * the protection is broken, @cgrp can be destroyed at any point.
+	 * Pin it so that it stays accessible.
+	 */
+	cgroup_get(cgrp);
+	kernfs_break_active_protection(kn);
 
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
-	ret = cgroup_destroy_locked(dentry->d_fsdata);
+
+	/*
+	 * @cgrp might already have been destroyed while we're trying to
+	 * grab the mutexes.
+	 */
+	if (!cgroup_is_dead(cgrp))
+		ret = cgroup_destroy_locked(cgrp);
+
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 
+	kernfs_unbreak_active_protection(kn);
+	cgroup_put(cgrp);
 	return ret;
 }
 
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+	.remount_fs = cgroup_remount,
+	.show_options = cgroup_show_options,
+	.mkdir = cgroup_mkdir,
+	.rmdir = cgroup_rmdir,
+	.rename = cgroup_rename,
+};
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
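cgroup_rmdir() above is a self-removal issued from inside a kernfs callback, so it must break kernfs's active protection on @kn before it can destroy the cgroup the callback is running on, and it pins @cgrp because the cgroup may disappear the moment that protection is dropped. The bare shape of that pattern (hypothetical name; cgroup_tree_mutex elided):

static int example_self_destroying_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp = kn->priv;
	int ret = 0;

	/* once active protection is broken, @cgrp may be destroyed; pin it */
	cgroup_get(cgrp);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_mutex);
	/* someone else may have destroyed it while we took the mutex */
	if (!cgroup_is_dead(cgrp))
		ret = cgroup_destroy_locked(cgrp);
	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
	return ret;
}
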
@@ -4635,11 +4248,7 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int i, err;
 
-	err = bdi_init(&cgroup_backing_dev_info);
-	if (err)
-		return err;
-
-	cgroup_init_cftypes(NULL, cgroup_base_files);
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 
 	for_each_subsys(ss, i) {
 		if (!ss->early_init)
@@ -4669,24 +4278,17 @@ int __init cgroup_init(void)
 	mutex_unlock(&cgroup_mutex);
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
-	if (!cgroup_kobj) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!cgroup_kobj)
+		return -ENOMEM;
 
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0) {
 		kobject_put(cgroup_kobj);
-		goto out;
+		return err;
 	}
 
 	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
-
-out:
-	if (err)
-		bdi_destroy(&cgroup_backing_dev_info);
-
-	return err;
+	return 0;
 }
 
 static int __init cgroup_wq_init(void)
@@ -5095,18 +4697,25 @@ __setup("cgroup_disable=", cgroup_disable);
 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
 						struct cgroup_subsys *ss)
 {
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct cgroup_subsys_state *css = NULL;
 	struct cgroup *cgrp;
-	struct cgroup_subsys_state *css;
 
 	/* is @dentry a cgroup dir? */
-	if (!dentry->d_inode ||
-	    dentry->d_inode->i_op != &cgroup_dir_inode_operations)
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+	    kernfs_type(kn) != KERNFS_DIR)
 		return ERR_PTR(-EBADF);
 
 	rcu_read_lock();
 
-	cgrp = __d_cgrp(dentry);
-	css = cgroup_css(cgrp, ss);
+	/*
+	 * This path doesn't originate from kernfs and @kn could already
+	 * have been or be removed at any point.  @kn->priv is RCU
+	 * protected for this access.  See destroy_locked() for details.
+	 */
+	cgrp = rcu_dereference(kn->priv);
+	if (cgrp)
+		css = cgroup_css(cgrp, ss);
 
 	if (!css || !css_tryget(css))
 		css = ERR_PTR(-ENOENT);