Diffstat (limited to 'kernel/cgroup.c')
 -rw-r--r--	kernel/cgroup.c	1831
 1 file changed, 1185 insertions, 646 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
  * distribution for more details.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/cgroup.h>
 #include <linux/cred.h>
 #include <linux/ctype.h>
@@ -33,6 +35,7 @@
 #include <linux/init_task.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/magic.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
@@ -69,15 +72,6 @@
 	   MAX_CFTYPE_NAME + 2)
 
 /*
- * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
- * creation/removal and hierarchy changing operations including cgroup
- * creation, removal, css association and controller rebinding.  This outer
- * lock is needed mainly to resolve the circular dependency between kernfs
- * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
- */
-static DEFINE_MUTEX(cgroup_tree_mutex);
-
-/*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
 #endif
 
 /*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
  * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
-#define cgroup_assert_mutexes_or_rcu_locked()				\
+#define cgroup_assert_mutex_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
-			   lockdep_is_held(&cgroup_tree_mutex) ||	\
 			   lockdep_is_held(&cgroup_mutex),		\
-			   "cgroup_[tree_]mutex or RCU read lock required");
+			   "cgroup_mutex or RCU read lock required");
 
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
  */
 static bool cgrp_dfl_root_visible;
 
+/* some controllers are not supported in the default hierarchy */
+static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
+#ifdef CONFIG_CGROUP_DEBUG
+	| (1 << debug_cgrp_id)
+#endif
+	;
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
@@ -159,14 +165,13 @@ static int cgroup_root_count;
 static DEFINE_IDR(cgroup_hierarchy_idr);
 
 /*
- * Assign a monotonically increasing serial number to cgroups.  It
- * guarantees cgroups with bigger numbers are newer than those with smaller
- * numbers.  Also, as cgroups are always appended to the parent's
- * ->children list, it guarantees that sibling cgroups are always sorted in
- * the ascending serial number order on the list.  Protected by
- * cgroup_mutex.
+ * Assign a monotonically increasing serial number to csses.  It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list.  Protected by cgroup_mutex.
  */
-static u64 cgroup_serial_nr_next = 1;
+static u64 css_serial_nr_next = 1;
 
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call.  This avoids us having to do
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];
 
 static void cgroup_put(struct cgroup *cgrp);
 static int rebind_subsystems(struct cgroup_root *dst_root,
-			     unsigned long ss_mask);
-static void cgroup_destroy_css_killed(struct cgroup *cgrp);
+			     unsigned int ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+			    gfp_t gfp_mask)
+{
+	int ret;
+
+	idr_preload(gfp_mask);
+	spin_lock_bh(&cgroup_idr_lock);
+	ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+	spin_unlock_bh(&cgroup_idr_lock);
+	idr_preload_end();
+	return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+	void *ret;
+
+	spin_lock_bh(&cgroup_idr_lock);
+	ret = idr_replace(idr, ptr, id);
+	spin_unlock_bh(&cgroup_idr_lock);
+	return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+	spin_lock_bh(&cgroup_idr_lock);
+	idr_remove(idr, id);
+	spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+	if (parent_css)
+		return container_of(parent_css, struct cgroup, self);
+	return NULL;
+}
+
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
  *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 {
 	if (ss)
 		return rcu_dereference_check(cgrp->subsys[ss->id],
-					lockdep_is_held(&cgroup_tree_mutex) ||
 					lockdep_is_held(&cgroup_mutex));
 	else
-		return &cgrp->dummy_css;
+		return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effective css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+						struct cgroup_subsys *ss)
+{
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (!ss)
+		return &cgrp->self;
+
+	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+		return NULL;
+
+	while (cgroup_parent(cgrp) &&
+	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+		cgrp = cgroup_parent(cgrp);
+
+	return cgroup_css(cgrp, ss);
 }
 
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_DEAD, &cgrp->flags);
+	return !(cgrp->self.flags & CSS_ONLINE);
 }
 
-struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
-	struct kernfs_open_file *of = seq->private;
 	struct cgroup *cgrp = of->kn->parent->priv;
-	struct cftype *cft = seq_cft(seq);
+	struct cftype *cft = of_cft(of);
 
 	/*
 	 * This is open and unprotected implementation of cgroup_css().
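The ancestor walk in cgroup_e_css() is what makes partially enabled subtrees on the default hierarchy resolve to a concrete css. A worked example of the semantics on an assumed layout (illustrative, not taken from the patch):

/*
 * Assumed default-hierarchy layout:
 *
 *	A  (child_subsys_mask contains memory)
 *	`- B  (child_subsys_mask empty)
 *	   `- C
 *
 * B has its own memory css because A enables memory for its children;
 * C does not.  Under cgroup_mutex:
 *
 *	cgroup_e_css(C, &memory_cgrp_subsys)
 *		== cgroup_css(B, &memory_cgrp_subsys)
 *
 * so tasks in C are charged to the nearest ancestor with memory
 * enabled, and the result is non-NULL whenever memory is bound to the
 * hierarchy C is on.
 */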
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
 	if (cft->ss)
 		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 	else
-		return &cgrp->dummy_css;
+		return &cgrp->self;
 }
-EXPORT_SYMBOL_GPL(seq_css);
+EXPORT_SYMBOL_GPL(of_css);
 
 /**
  * cgroup_is_descendant - test ancestry
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 	while (cgrp) {
 		if (cgrp == ancestor)
 			return true;
-		cgrp = cgrp->parent;
+		cgrp = cgroup_parent(cgrp);
 	}
 	return false;
 }
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
- * Should be called under cgroup_mutex.
+ * Should be called under cgroup_[tree_]mutex.
 */
 #define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
-				lockdep_is_held(&cgroup_tree_mutex) ||	\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
 
 /**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
+		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+			;						\
+		else
+
+/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
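Unlike for_each_css(), which skips subsystems without a css on the cgroup itself, for_each_e_css() substitutes the effective css from the nearest enabled ancestor, so every controller bound to the hierarchy is visited. A hedged usage sketch; the function below is hypothetical, but the migration paths later in this series iterate the same way:

static void example_walk_effective(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	for_each_e_css(css, ssid, cgrp)
		pr_info("%s: effective css comes from %s\n",
			cgroup_subsys[ssid]->name,
			css->cgroup == cgrp ? "this cgroup" : "an ancestor");
}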
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
 #define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)
 
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the mutex should be later unlocked.  On
- * failure returns false with no lock held.
- */
-static bool cgroup_lock_live_group(struct cgroup *cgrp)
-{
-	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_dead(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		return false;
-	}
-	return true;
-}
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp)				\
+	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       cgroup_is_dead(child); }))			\
+			;						\
+		else
 
 /* the list of cgroups eligible for automatic release.  Protected by
 * release_list_lock */
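cgroup_for_each_live_child() folds the liveness check that callers of the removed cgroup_lock_live_group() performed by hand into the iterator itself; the caller simply holds cgroup_mutex across the walk. A trivial sketch (hypothetical function):

static int example_count_live_children(struct cgroup *cgrp)
{
	struct cgroup *child;
	int n = 0;

	lockdep_assert_held(&cgroup_mutex);

	cgroup_for_each_live_child(child, cgrp)
		n++;
	return n;
}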
@@ -348,7 +425,7 @@ struct cgrp_cset_link {
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
@@ -359,6 +436,43 @@ static struct css_set init_css_set = {
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
 
+/**
+ * cgroup_update_populated - update populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly.  The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed.  This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+	lockdep_assert_held(&css_set_rwsem);
+
+	do {
+		bool trigger;
+
+		if (populated)
+			trigger = !cgrp->populated_cnt++;
+		else
+			trigger = !--cgrp->populated_cnt;
+
+		if (!trigger)
+			break;
+
+		if (cgrp->populated_kn)
+			kernfs_notify(cgrp->populated_kn);
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
+}
+
 /*
 * hash table for cgroup groups.  This improves the performance to find
 * an existing css_set.  This hash doesn't (currently) take into
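The kernfs_notify() call in cgroup_update_populated() surfaces to userland as a poll event on the "cgroup.populated" interface file, so an agent can sleep until a subtree drains instead of rescanning task lists. A hedged userspace sketch; the path, error handling, and the POLLPRI convention (kernfs wakes pollers with POLLERR|POLLPRI) are assumptions about typical usage, not part of the patch:

#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

/* block until the cgroup and all of its descendants are empty */
static int wait_until_empty(const char *populated_path)
{
	char c;
	int fd = open(populated_path, O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

	if (fd < 0)
		return -1;
	for (;;) {
		if (pread(fd, &c, 1, 0) == 1 && c == '0') {
			close(fd);
			return 0;	/* populated_cnt reached zero */
		}
		poll(&pfd, 1, -1);	/* woken by kernfs_notify() */
	}
}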
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 {
	struct cgrp_cset_link *link, *tmp_link;
+	struct cgroup_subsys *ss;
+	int ssid;
 
	lockdep_assert_held(&css_set_rwsem);
 
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
		return;
 
	/* This css_set is dead. unlink it and release cgroup refcounts */
+	for_each_subsys(ss, ssid)
+		list_del(&cset->e_cset_node[ssid]);
	hash_del(&cset->hlist);
	css_set_count--;
 
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
		list_del(&link->cgrp_link);
 
		/* @cgrp can't go away while we're holding css_set_rwsem */
-		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
-			if (taskexit)
-				set_bit(CGRP_RELEASABLE, &cgrp->flags);
-			check_for_release(cgrp);
+		if (list_empty(&cgrp->cset_links)) {
+			cgroup_update_populated(cgrp, false);
+			if (notify_on_release(cgrp)) {
+				if (taskexit)
+					set_bit(CGRP_RELEASABLE, &cgrp->flags);
+				check_for_release(cgrp);
+			}
		}
 
		kfree(link);
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
 {
	struct list_head *l1, *l2;
 
-	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
-		/* Not all subsystems matched */
+	/*
+	 * On the default hierarchy, there can be csets which are
+	 * associated with the same set of cgroups but different csses.
+	 * Let's first ensure that csses match.
+	 */
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;
-	}
 
	/*
	 * Compare cgroup pointers in order to distinguish between
-	 * different cgroups in heirarchies with no subsystems. We
-	 * could get by with just this check alone (and skip the
-	 * memcmp above) but on most setups the memcmp check will
-	 * avoid the need for this more expensive check on almost all
-	 * candidates.
+	 * different cgroups in hierarchies.  As different cgroups may
+	 * share the same effective css, this comparison is always
+	 * necessary.
	 */
-
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
-		if (root->cgrp.subsys_mask & (1UL << i)) {
-			/* Subsystem is in this hierarchy. So we want
-			 * the subsystem state from the new
-			 * cgroup */
-			template[i] = cgroup_css(cgrp, ss);
+		if (root->subsys_mask & (1UL << i)) {
+			/*
+			 * @ss is in this hierarchy, so we want the
+			 * effective css from @cgrp.
+			 */
+			template[i] = cgroup_e_css(cgrp, ss);
		} else {
-			/* Subsystem is not in this hierarchy, so we
-			 * don't want to change the subsystem state */
+			/*
+			 * @ss is not in this hierarchy, so we don't want
+			 * to change the css.
+			 */
			template[i] = old_cset->subsys[i];
		}
	}
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
	struct cgrp_cset_link *link;
 
	BUG_ON(list_empty(tmp_links));
+
+	if (cgroup_on_dfl(cgrp))
+		cset->dfl_cgrp = cgrp;
+
	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;
+
+	if (list_empty(&cgrp->cset_links))
+		cgroup_update_populated(cgrp, true);
	list_move(&link->cset_link, &cgrp->cset_links);
+
	/*
	 * Always add links to the tail of the list so that the list
	 * is sorted by order of hierarchy creation
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
+	struct cgroup_subsys *ss;
	unsigned long key;
+	int ssid;
 
	lockdep_assert_held(&cgroup_mutex);
 
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
	css_set_count++;
 
-	/* Add this cgroup group to the hash table */
+	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);
 
+	for_each_subsys(ss, ssid)
+		list_add_tail(&cset->e_cset_node[ssid],
+			      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
	up_write(&css_set_rwsem);
 
	return cset;
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;
 
-	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);
 
	BUG_ON(atomic_read(&root->nr_cgrps));
-	BUG_ON(!list_empty(&cgrp->children));
+	BUG_ON(!list_empty(&cgrp->self.children));
 
	/* Rebind all subsystems back to the default hierarchy */
-	rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
+	rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
 
	/*
	 * Release all the links from cset_links to this hierarchy's
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
	cgroup_exit_root_id(root);
 
	mutex_unlock(&cgroup_mutex);
-	mutex_unlock(&cgroup_tree_mutex);
 
	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 * update of a tasks cgroup pointer by cgroup_attach_task()
 */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;
 
-	if (cft->write_u64 || cft->write_s64 || cft->write_string ||
-	    cft->trigger)
+	if (cft->write_u64 || cft->write_s64 || cft->write)
		mode |= S_IWUSR;
 
	return mode;
 }
 
-static void cgroup_free_fn(struct work_struct *work)
+static void cgroup_get(struct cgroup *cgrp)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
-
-	atomic_dec(&cgrp->root->nr_cgrps);
-	cgroup_pidlist_destroy_all(cgrp);
-
-	if (cgrp->parent) {
-		/*
-		 * We get a ref to the parent, and put the ref when this
-		 * cgroup is being freed, so it's guaranteed that the
-		 * parent won't be destroyed before its children.
-		 */
-		cgroup_put(cgrp->parent);
-		kernfs_put(cgrp->kn);
-		kfree(cgrp);
-	} else {
-		/*
-		 * This is root cgroup's refcnt reaching zero, which
-		 * indicates that the root should be released.
-		 */
-		cgroup_destroy_root(cgrp->root);
-	}
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	css_get(&cgrp->self);
 }
 
-static void cgroup_free_rcu(struct rcu_head *head)
+static void cgroup_put(struct cgroup *cgrp)
 {
-	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
-
-	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
-	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+	css_put(&cgrp->self);
 }
 
-static void cgroup_get(struct cgroup *cgrp)
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded.  Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time.  If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
+ */
+static void cgroup_kn_unlock(struct kernfs_node *kn)
 {
-	WARN_ON_ONCE(cgroup_is_dead(cgrp));
-	WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
-	atomic_inc(&cgrp->refcnt);
+	struct cgroup *cgrp;
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		cgrp = kn->priv;
+	else
+		cgrp = kn->parent->priv;
+
+	mutex_unlock(&cgroup_mutex);
+
+	kernfs_unbreak_active_protection(kn);
+	cgroup_put(cgrp);
 }
 
-static void cgroup_put(struct cgroup *cgrp)
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn.  It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive.  Returns the cgroup if
+ * alive; otherwise, %NULL.  A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper.  It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
 {
-	if (!atomic_dec_and_test(&cgrp->refcnt))
-		return;
-	if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
-		return;
+	struct cgroup *cgrp;
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		cgrp = kn->priv;
+	else
+		cgrp = kn->parent->priv;
 
	/*
-	 * XXX: cgrp->id is only used to look up css's.  As cgroup and
-	 * css's lifetimes will be decoupled, it should be made
-	 * per-subsystem and moved to css->id so that lookups are
-	 * successful until the target css is released.
+	 * We're gonna grab cgroup_mutex which nests outside kernfs
+	 * active_ref.  cgroup liveliness check alone provides enough
+	 * protection against removal.  Ensure @cgrp stays accessible and
+	 * break the active_ref protection.
	 */
+	cgroup_get(cgrp);
+	kernfs_break_active_protection(kn);
+
	mutex_lock(&cgroup_mutex);
-	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
-	mutex_unlock(&cgroup_mutex);
-	cgrp->id = -1;
 
-	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
+	if (!cgroup_is_dead(cgrp))
+		return cgrp;
+
+	cgroup_kn_unlock(kn);
+	return NULL;
 }
 
 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
	char name[CGROUP_FILE_NAME_MAX];
 
-	lockdep_assert_held(&cgroup_tree_mutex);
+	lockdep_assert_held(&cgroup_mutex);
	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
 }
 
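cgroup_get()/cgroup_put() now ride on the self css refcount, and the cgroup_kn_lock_live()/cgroup_kn_unlock() pair packages the break-active-protection dance so that cgroup_mutex never nests inside a kernfs active reference. A sketch of the intended calling convention; the handler is hypothetical but mirrors how this series converts the write handlers:

static ssize_t example_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	int ret = 0;

	cgrp = cgroup_kn_lock_live(of->kn);	/* pins @cgrp, takes cgroup_mutex */
	if (!cgrp)
		return -ENODEV;			/* raced with removal */

	/* ... operate on @cgrp; even self-removal is safe here ... */

	cgroup_kn_unlock(of->kn);		/* drops the mutex and the pin */
	return ret ?: nbytes;
}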
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 * @cgrp: target cgroup
 * @subsys_mask: mask of the subsystem ids whose files should be removed
 */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
 {
	struct cgroup_subsys *ss;
	int i;
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
	for_each_subsys(ss, i) {
		struct cftype *cfts;
 
-		if (!test_bit(i, &subsys_mask))
+		if (!(subsys_mask & (1 << i)))
			continue;
		list_for_each_entry(cfts, &ss->cfts, node)
			cgroup_addrm_files(cgrp, cfts, false);
	}
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root,
-			     unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
 {
	struct cgroup_subsys *ss;
-	int ssid, ret;
+	unsigned int tmp_ss_mask;
+	int ssid, i, ret;
 
-	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);
 
	for_each_subsys(ss, ssid) {
		if (!(ss_mask & (1 << ssid)))
			continue;
 
-		/* if @ss is on the dummy_root, we can always move it */
-		if (ss->root == &cgrp_dfl_root)
-			continue;
-
-		/* if @ss has non-root cgroups attached to it, can't move */
-		if (!list_empty(&ss->root->cgrp.children))
+		/* if @ss has non-root csses attached to it, can't move */
+		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
			return -EBUSY;
 
		/* can't move between two non-dummy roots either */
-		if (dst_root != &cgrp_dfl_root)
+		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	}
 
-	ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
+	/* skip creating root files on dfl_root for inhibited subsystems */
+	tmp_ss_mask = ss_mask;
+	if (dst_root == &cgrp_dfl_root)
+		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
+
+	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
	if (ret) {
		if (dst_root != &cgrp_dfl_root)
			return ret;
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
	 * Just warn about it and continue.
	 */
	if (cgrp_dfl_root_visible) {
-		pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
-			   ret, ss_mask);
-		pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
+		pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+			ret, ss_mask);
+		pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
	}
 }
 
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
	 * Nothing can fail from this point on.  Remove files for the
	 * removed subsystems and rebind each subsystem.
	 */
-	mutex_unlock(&cgroup_mutex);
	for_each_subsys(ss, ssid)
		if (ss_mask & (1 << ssid))
			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-	mutex_lock(&cgroup_mutex);
 
	for_each_subsys(ss, ssid) {
		struct cgroup_root *src_root;
		struct cgroup_subsys_state *css;
+		struct css_set *cset;
 
		if (!(ss_mask & (1 << ssid)))
			continue;
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
		ss->root = dst_root;
		css->cgroup = &dst_root->cgrp;
 
-		src_root->cgrp.subsys_mask &= ~(1 << ssid);
-		dst_root->cgrp.subsys_mask |= 1 << ssid;
+		down_write(&css_set_rwsem);
+		hash_for_each(css_set_table, i, cset, hlist)
+			list_move_tail(&cset->e_cset_node[ss->id],
+				       &dst_root->cgrp.e_csets[ss->id]);
+		up_write(&css_set_rwsem);
+
+		src_root->subsys_mask &= ~(1 << ssid);
+		src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+		/* default hierarchy doesn't enable controllers by default */
+		dst_root->subsys_mask |= 1 << ssid;
+		if (dst_root != &cgrp_dfl_root)
+			dst_root->cgrp.child_subsys_mask |= 1 << ssid;
 
		if (ss->bind)
			ss->bind(css);
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
	int ssid;
 
	for_each_subsys(ss, ssid)
-		if (root->cgrp.subsys_mask & (1 << ssid))
+		if (root->subsys_mask & (1 << ssid))
			seq_printf(seq, ",%s", ss->name);
	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
		seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
 }
 
 struct cgroup_sb_opts {
-	unsigned long subsys_mask;
-	unsigned long flags;
+	unsigned int subsys_mask;
+	unsigned int flags;
	char *release_agent;
	bool cpuset_clone_children;
	char *name;
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {
	bool none;
 };
 
-/*
- * Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
- * array. This function takes refcounts on subsystems to be used, unless it
- * returns error, in which case no refcounts are taken.
- */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
-	unsigned long mask = (unsigned long)-1;
+	unsigned int mask = -1U;
	struct cgroup_subsys *ss;
	int i;
 
-	BUG_ON(!mutex_is_locked(&cgroup_mutex));
-
 #ifdef CONFIG_CPUSETS
-	mask = ~(1UL << cpuset_cgrp_id);
+	mask = ~(1U << cpuset_cgrp_id);
 #endif
 
	memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
-			set_bit(i, &opts->subsys_mask);
+			opts->subsys_mask |= (1 << i);
			one_ss = true;
 
			break;
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
	/* Consistency checks */
 
	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+		pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
 
		if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
		    opts->cpuset_clone_children || opts->release_agent ||
		    opts->name) {
-			pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+			pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
			return -EINVAL;
		}
	} else {
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (!ss->disabled)
-				set_bit(i, &opts->subsys_mask);
+				opts->subsys_mask |= (1 << i);
 
	/*
	 * We either have to specify by name or by subsystems. (So
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
	int ret = 0;
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_sb_opts opts;
-	unsigned long added_mask, removed_mask;
+	unsigned int added_mask, removed_mask;
 
	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-		pr_err("cgroup: sane_behavior: remount is not allowed\n");
+		pr_err("sane_behavior: remount is not allowed\n");
		return -EINVAL;
	}
 
-	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);
 
	/* See what subsystems are wanted */
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
	if (ret)
		goto out_unlock;
 
-	if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
-		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);
 
-	added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
-	removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
+	added_mask = opts.subsys_mask & ~root->subsys_mask;
+	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
	/* Don't allow flags or name to change at remount */
	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
	    (opts.name && strcmp(opts.name, root->name))) {
-		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
		ret = -EINVAL;
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
	}
 
	/* remounting is not allowed for populated hierarchies */
-	if (!list_empty(&root->cgrp.children)) {
+	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_mutex);
-	mutex_unlock(&cgroup_tree_mutex);
	return ret;
 }
 
@@ -1369,14 +1521,22 @@ out_unlock:
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
-	atomic_set(&cgrp->refcnt, 1);
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	INIT_LIST_HEAD(&cgrp->self.sibling);
+	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->release_list);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
-	cgrp->dummy_css.cgroup = cgrp;
+	cgrp->self.cgroup = cgrp;
+	cgrp->self.flags |= CSS_ONLINE;
+
+	for_each_subsys(ss, ssid)
+		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+	init_waitqueue_head(&cgrp->offline_waitq);
 }
 
 static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
 {
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct css_set *cset;
	int i, ret;
 
-	lockdep_assert_held(&cgroup_tree_mutex);
	lockdep_assert_held(&cgroup_mutex);
 
-	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;
 
+	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+	if (ret)
+		goto out;
+
	/*
	 * We're accessing css_set_count without locking css_set_rwsem here,
	 * but that's OK - it can only be increased by someone holding
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
	 */
	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
	if (ret)
-		goto out;
+		goto cancel_ref;
 
	ret = cgroup_init_root_id(root);
	if (ret)
-		goto out;
+		goto cancel_ref;
 
	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
		link_css_set(&tmp_links, cset, root_cgrp);
	up_write(&css_set_rwsem);
 
-	BUG_ON(!list_empty(&root_cgrp->children));
+	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
 
	kernfs_activate(root_cgrp->kn);
@@ -1474,6 +1637,8 @@ destroy_root:
	root->kf_root = NULL;
 exit_root_id:
	cgroup_exit_root_id(root);
+cancel_ref:
+	percpu_ref_cancel_init(&root_cgrp->self.refcnt);
 out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
@@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();
-retry:
-	mutex_lock(&cgroup_tree_mutex);
+
	mutex_lock(&cgroup_mutex);
 
	/* First find the desired set of subsystems */
@@ -1535,7 +1699,7 @@ retry:
	 * subsystems) then they must match.
	 */
	if ((opts.subsys_mask || opts.none) &&
-	    (opts.subsys_mask != root->cgrp.subsys_mask)) {
+	    (opts.subsys_mask != root->subsys_mask)) {
		if (!name_match)
			continue;
		ret = -EBUSY;
| @@ -1544,28 +1708,27 @@ retry: | |||
| 1544 | 1708 | ||
| 1545 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { | 1709 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { |
| 1546 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | 1710 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { |
| 1547 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); | 1711 | pr_err("sane_behavior: new mount options should match the existing superblock\n"); |
| 1548 | ret = -EINVAL; | 1712 | ret = -EINVAL; |
| 1549 | goto out_unlock; | 1713 | goto out_unlock; |
| 1550 | } else { | 1714 | } else { |
| 1551 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1715 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); |
| 1552 | } | 1716 | } |
| 1553 | } | 1717 | } |
| 1554 | 1718 | ||
| 1555 | /* | 1719 | /* |
| 1556 | * A root's lifetime is governed by its root cgroup. Zero | 1720 | * A root's lifetime is governed by its root cgroup. |
| 1557 | * ref indicate that the root is being | 1721 | * tryget_live failure indicates that the root is being |
| 1558 | * destruction to complete so that the subsystems are free. | 1722 | * destroyed. Wait for destruction to complete so that the |
| 1559 | * We can use wait_queue for the wait but this path is | 1723 | * subsystems are free. We can use wait_queue for the wait |
| 1560 | * super cold. Let's just sleep for a bit and retry. | 1724 | * but this path is super cold. Let's just sleep for a bit |
| 1725 | * and retry. | ||
| 1561 | */ | 1726 | */ |
| 1562 | if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { | 1727 | if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { |
| 1563 | mutex_unlock(&cgroup_mutex); | 1728 | mutex_unlock(&cgroup_mutex); |
| 1564 | mutex_unlock(&cgroup_tree_mutex); | ||
| 1565 | kfree(opts.release_agent); | ||
| 1566 | kfree(opts.name); | ||
| 1567 | msleep(10); | 1729 | msleep(10); |
| 1568 | goto retry; | 1730 | ret = restart_syscall(); |
| 1731 | goto out_free; | ||
| 1569 | } | 1732 | } |
| 1570 | 1733 | ||
| 1571 | ret = 0; | 1734 | ret = 0; |
| @@ -1596,15 +1759,15 @@ retry: | |||
| 1596 | 1759 | ||
| 1597 | out_unlock: | 1760 | out_unlock: |
| 1598 | mutex_unlock(&cgroup_mutex); | 1761 | mutex_unlock(&cgroup_mutex); |
| 1599 | mutex_unlock(&cgroup_tree_mutex); | 1762 | out_free: |
| 1600 | |||
| 1601 | kfree(opts.release_agent); | 1763 | kfree(opts.release_agent); |
| 1602 | kfree(opts.name); | 1764 | kfree(opts.name); |
| 1603 | 1765 | ||
| 1604 | if (ret) | 1766 | if (ret) |
| 1605 | return ERR_PTR(ret); | 1767 | return ERR_PTR(ret); |
| 1606 | 1768 | ||
| 1607 | dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); | 1769 | dentry = kernfs_mount(fs_type, flags, root->kf_root, |
| 1770 | CGROUP_SUPER_MAGIC, &new_sb); | ||
| 1608 | if (IS_ERR(dentry) || !new_sb) | 1771 | if (IS_ERR(dentry) || !new_sb) |
| 1609 | cgroup_put(&root->cgrp); | 1772 | cgroup_put(&root->cgrp); |
| 1610 | return dentry; | 1773 | return dentry; |
| @@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb) | |||
| 1615 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb); | 1778 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb); |
| 1616 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1779 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
| 1617 | 1780 | ||
| 1618 | cgroup_put(&root->cgrp); | 1781 | /* |
| 1782 | * If @root doesn't have any mounts or children, start killing it. | ||
| 1783 | * This prevents new mounts by disabling percpu_ref_tryget_live(). | ||
| 1784 | * cgroup_mount() may wait for @root's release. | ||
| 1785 | * | ||
| 1786 | * And don't kill the default root. | ||
| 1787 | */ | ||
| 1788 | if (css_has_online_children(&root->cgrp.self) || | ||
| 1789 | root == &cgrp_dfl_root) | ||
| 1790 | cgroup_put(&root->cgrp); | ||
| 1791 | else | ||
| 1792 | percpu_ref_kill(&root->cgrp.self.refcnt); | ||
| 1793 | |||
| 1619 | kernfs_kill_sb(sb); | 1794 | kernfs_kill_sb(sb); |
| 1620 | } | 1795 | } |
| 1621 | 1796 | ||
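The mount and kill_sb paths above follow the generic percpu_ref live/kill protocol: new users take references with percpu_ref_tryget_live(), which is guaranteed to fail once percpu_ref_kill() has run, and the release callback fires when the last reference is dropped. A minimal sketch of that protocol, with illustrative names (my_root, my_root_release and friends are not kernel symbols):

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_root {
	struct percpu_ref refcnt;
};

static void my_root_release(struct percpu_ref *ref)
{
	/* last reference gone; safe to free */
	kfree(container_of(ref, struct my_root, refcnt));
}

static struct my_root *my_root_create(void)
{
	struct my_root *root = kzalloc(sizeof(*root), GFP_KERNEL);

	if (root && percpu_ref_init(&root->refcnt, my_root_release)) {
		kfree(root);
		root = NULL;
	}
	return root;
}

/* lookup path: fails once my_root_destroy() has started teardown */
static bool my_root_get_live(struct my_root *root)
{
	return percpu_ref_tryget_live(&root->refcnt);
}

/* teardown: disable tryget_live() and drop the base reference */
static void my_root_destroy(struct my_root *root)
{
	percpu_ref_kill(&root->refcnt);
}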
| @@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
| 1737 | 1912 | ||
| 1738 | /** | 1913 | /** |
| 1739 | * cgroup_task_migrate - move a task from one cgroup to another. | 1914 | * cgroup_task_migrate - move a task from one cgroup to another. |
| 1740 | * @old_cgrp; the cgroup @tsk is being migrated from | 1915 | * @old_cgrp: the cgroup @tsk is being migrated from |
| 1741 | * @tsk: the task being migrated | 1916 | * @tsk: the task being migrated |
| 1742 | * @new_cset: the new css_set @tsk is being attached to | 1917 | * @new_cset: the new css_set @tsk is being attached to |
| 1743 | * | 1918 | * |
| @@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
| 1829 | 2004 | ||
| 1830 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); | 2005 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
| 1831 | 2006 | ||
| 1832 | /* nothing to do if this cset already belongs to the cgroup */ | ||
| 1833 | if (src_cgrp == dst_cgrp) | ||
| 1834 | return; | ||
| 1835 | |||
| 1836 | if (!list_empty(&src_cset->mg_preload_node)) | 2007 | if (!list_empty(&src_cset->mg_preload_node)) |
| 1837 | return; | 2008 | return; |
| 1838 | 2009 | ||
| @@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
| 1847 | 2018 | ||
| 1848 | /** | 2019 | /** |
| 1849 | * cgroup_migrate_prepare_dst - prepare destination css_sets for migration | 2020 | * cgroup_migrate_prepare_dst - prepare destination css_sets for migration |
| 1850 | * @dst_cgrp: the destination cgroup | 2021 | * @dst_cgrp: the destination cgroup (may be %NULL) |
| 1851 | * @preloaded_csets: list of preloaded source css_sets | 2022 | * @preloaded_csets: list of preloaded source css_sets |
| 1852 | * | 2023 | * |
| 1853 | * Tasks are about to be moved to @dst_cgrp and all the source css_sets | 2024 | * Tasks are about to be moved to @dst_cgrp and all the source css_sets |
| 1854 | * have been preloaded to @preloaded_csets. This function looks up and | 2025 | * have been preloaded to @preloaded_csets. This function looks up and |
| 1855 | * pins all destination css_sets, links each to its source, and put them on | 2026 | * pins all destination css_sets, links each to its source, and appends them |
| 1856 | * @preloaded_csets. | 2027 | * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each |
| 2028 | * source css_set is assumed to be its cgroup on the default hierarchy. | ||
| 1857 | * | 2029 | * |
| 1858 | * This function must be called after cgroup_migrate_add_src() has been | 2030 | * This function must be called after cgroup_migrate_add_src() has been |
| 1859 | * called on each migration source css_set. After migration is performed | 2031 | * called on each migration source css_set. After migration is performed |
| @@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 1864 | struct list_head *preloaded_csets) | 2036 | struct list_head *preloaded_csets) |
| 1865 | { | 2037 | { |
| 1866 | LIST_HEAD(csets); | 2038 | LIST_HEAD(csets); |
| 1867 | struct css_set *src_cset; | 2039 | struct css_set *src_cset, *tmp_cset; |
| 1868 | 2040 | ||
| 1869 | lockdep_assert_held(&cgroup_mutex); | 2041 | lockdep_assert_held(&cgroup_mutex); |
| 1870 | 2042 | ||
| 2043 | /* | ||
| 2044 | * Except for the root, child_subsys_mask must be zero for a cgroup | ||
| 2045 | * with tasks so that child cgroups don't compete against tasks. | ||
| 2046 | */ | ||
| 2047 | if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && | ||
| 2048 | dst_cgrp->child_subsys_mask) | ||
| 2049 | return -EBUSY; | ||
| 2050 | |||
| 1871 | /* look up the dst cset for each src cset and link it to src */ | 2051 | /* look up the dst cset for each src cset and link it to src */ |
| 1872 | list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { | 2052 | list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { |
| 1873 | struct css_set *dst_cset; | 2053 | struct css_set *dst_cset; |
| 1874 | 2054 | ||
| 1875 | dst_cset = find_css_set(src_cset, dst_cgrp); | 2055 | dst_cset = find_css_set(src_cset, |
| 2056 | dst_cgrp ?: src_cset->dfl_cgrp); | ||
| 1876 | if (!dst_cset) | 2057 | if (!dst_cset) |
| 1877 | goto err; | 2058 | goto err; |
| 1878 | 2059 | ||
| 1879 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); | 2060 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); |
| 2061 | |||
| 2062 | /* | ||
| 2063 | * If src cset equals dst, it's a noop. Drop the src. | ||
| 2064 | * cgroup_migrate() will skip the cset too. Note that we | ||
| 2065 | * can't handle src == dst as some nodes are used by both. | ||
| 2066 | */ | ||
| 2067 | if (src_cset == dst_cset) { | ||
| 2068 | src_cset->mg_src_cgrp = NULL; | ||
| 2069 | list_del_init(&src_cset->mg_preload_node); | ||
| 2070 | put_css_set(src_cset, false); | ||
| 2071 | put_css_set(dst_cset, false); | ||
| 2072 | continue; | ||
| 2073 | } | ||
| 2074 | |||
| 1880 | src_cset->mg_dst_cset = dst_cset; | 2075 | src_cset->mg_dst_cset = dst_cset; |
| 1881 | 2076 | ||
| 1882 | if (list_empty(&dst_cset->mg_preload_node)) | 2077 | if (list_empty(&dst_cset->mg_preload_node)) |
| @@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 1885 | put_css_set(dst_cset, false); | 2080 | put_css_set(dst_cset, false); |
| 1886 | } | 2081 | } |
| 1887 | 2082 | ||
| 1888 | list_splice(&csets, preloaded_csets); | 2083 | list_splice_tail(&csets, preloaded_csets); |
| 1889 | return 0; | 2084 | return 0; |
| 1890 | err: | 2085 | err: |
| 1891 | cgroup_migrate_finish(&csets); | 2086 | cgroup_migrate_finish(&csets); |
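The prepare_dst loop above switched to list_for_each_entry_safe() because a src cset may now be unlinked from the preload list mid-walk. The pattern, reduced to its essentials (struct item and the drop callback are illustrative):

#include <linux/list.h>
#include <linux/types.h>

struct item {
	struct list_head node;
};

/* the _safe variant caches the next pointer up front, so deleting
 * @pos inside the loop body is fine */
static void drop_matching(struct list_head *head, bool (*drop)(struct item *))
{
	struct item *pos, *tmp;

	list_for_each_entry_safe(pos, tmp, head, node)
		if (drop(pos))
			list_del_init(&pos->node);
}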
| @@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
| 1966 | return 0; | 2161 | return 0; |
| 1967 | 2162 | ||
| 1968 | /* check that we can legitimately attach to the cgroup */ | 2163 | /* check that we can legitimately attach to the cgroup */ |
| 1969 | for_each_css(css, i, cgrp) { | 2164 | for_each_e_css(css, i, cgrp) { |
| 1970 | if (css->ss->can_attach) { | 2165 | if (css->ss->can_attach) { |
| 1971 | ret = css->ss->can_attach(css, &tset); | 2166 | ret = css->ss->can_attach(css, &tset); |
| 1972 | if (ret) { | 2167 | if (ret) { |
| @@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
| 1996 | */ | 2191 | */ |
| 1997 | tset.csets = &tset.dst_csets; | 2192 | tset.csets = &tset.dst_csets; |
| 1998 | 2193 | ||
| 1999 | for_each_css(css, i, cgrp) | 2194 | for_each_e_css(css, i, cgrp) |
| 2000 | if (css->ss->attach) | 2195 | if (css->ss->attach) |
| 2001 | css->ss->attach(css, &tset); | 2196 | css->ss->attach(css, &tset); |
| 2002 | 2197 | ||
| @@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
| 2004 | goto out_release_tset; | 2199 | goto out_release_tset; |
| 2005 | 2200 | ||
| 2006 | out_cancel_attach: | 2201 | out_cancel_attach: |
| 2007 | for_each_css(css, i, cgrp) { | 2202 | for_each_e_css(css, i, cgrp) { |
| 2008 | if (css == failed_css) | 2203 | if (css == failed_css) |
| 2009 | break; | 2204 | break; |
| 2010 | if (css->ss->cancel_attach) | 2205 | if (css->ss->cancel_attach) |
| @@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
| 2063 | * function to attach either it or all tasks in its threadgroup. Will lock | 2258 | * function to attach either it or all tasks in its threadgroup. Will lock |
| 2064 | * cgroup_mutex and threadgroup. | 2259 | * cgroup_mutex and threadgroup. |
| 2065 | */ | 2260 | */ |
| 2066 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | 2261 | static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, |
| 2262 | size_t nbytes, loff_t off, bool threadgroup) | ||
| 2067 | { | 2263 | { |
| 2068 | struct task_struct *tsk; | 2264 | struct task_struct *tsk; |
| 2069 | const struct cred *cred = current_cred(), *tcred; | 2265 | const struct cred *cred = current_cred(), *tcred; |
| 2266 | struct cgroup *cgrp; | ||
| 2267 | pid_t pid; | ||
| 2070 | int ret; | 2268 | int ret; |
| 2071 | 2269 | ||
| 2072 | if (!cgroup_lock_live_group(cgrp)) | 2270 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) |
| 2271 | return -EINVAL; | ||
| 2272 | |||
| 2273 | cgrp = cgroup_kn_lock_live(of->kn); | ||
| 2274 | if (!cgrp) | ||
| 2073 | return -ENODEV; | 2275 | return -ENODEV; |
| 2074 | 2276 | ||
| 2075 | retry_find_task: | 2277 | retry_find_task: |
| @@ -2135,8 +2337,8 @@ retry_find_task: | |||
| 2135 | 2337 | ||
| 2136 | put_task_struct(tsk); | 2338 | put_task_struct(tsk); |
| 2137 | out_unlock_cgroup: | 2339 | out_unlock_cgroup: |
| 2138 | mutex_unlock(&cgroup_mutex); | 2340 | cgroup_kn_unlock(of->kn); |
| 2139 | return ret; | 2341 | return ret ?: nbytes; |
| 2140 | } | 2342 | } |
| 2141 | 2343 | ||
| 2142 | /** | 2344 | /** |
| @@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
| 2170 | } | 2372 | } |
| 2171 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2373 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
| 2172 | 2374 | ||
| 2173 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, | 2375 | static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, |
| 2174 | struct cftype *cft, u64 pid) | 2376 | char *buf, size_t nbytes, loff_t off) |
| 2175 | { | 2377 | { |
| 2176 | return attach_task_by_pid(css->cgroup, pid, false); | 2378 | return __cgroup_procs_write(of, buf, nbytes, off, false); |
| 2177 | } | 2379 | } |
| 2178 | 2380 | ||
| 2179 | static int cgroup_procs_write(struct cgroup_subsys_state *css, | 2381 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, |
| 2180 | struct cftype *cft, u64 tgid) | 2382 | char *buf, size_t nbytes, loff_t off) |
| 2181 | { | 2383 | { |
| 2182 | return attach_task_by_pid(css->cgroup, tgid, true); | 2384 | return __cgroup_procs_write(of, buf, nbytes, off, true); |
| 2183 | } | 2385 | } |
| 2184 | 2386 | ||
| 2185 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, | 2387 | static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, |
| 2186 | struct cftype *cft, char *buffer) | 2388 | char *buf, size_t nbytes, loff_t off) |
| 2187 | { | 2389 | { |
| 2188 | struct cgroup_root *root = css->cgroup->root; | 2390 | struct cgroup *cgrp; |
| 2189 | 2391 | ||
| 2190 | BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); | 2392 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
| 2191 | if (!cgroup_lock_live_group(css->cgroup)) | 2393 | |
| 2394 | cgrp = cgroup_kn_lock_live(of->kn); | ||
| 2395 | if (!cgrp) | ||
| 2192 | return -ENODEV; | 2396 | return -ENODEV; |
| 2193 | spin_lock(&release_agent_path_lock); | 2397 | spin_lock(&release_agent_path_lock); |
| 2194 | strlcpy(root->release_agent_path, buffer, | 2398 | strlcpy(cgrp->root->release_agent_path, strstrip(buf), |
| 2195 | sizeof(root->release_agent_path)); | 2399 | sizeof(cgrp->root->release_agent_path)); |
| 2196 | spin_unlock(&release_agent_path_lock); | 2400 | spin_unlock(&release_agent_path_lock); |
| 2197 | mutex_unlock(&cgroup_mutex); | 2401 | cgroup_kn_unlock(of->kn); |
| 2198 | return 0; | 2402 | return nbytes; |
| 2199 | } | 2403 | } |
| 2200 | 2404 | ||
| 2201 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) | 2405 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) |
| 2202 | { | 2406 | { |
| 2203 | struct cgroup *cgrp = seq_css(seq)->cgroup; | 2407 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
| 2204 | 2408 | ||
| 2205 | if (!cgroup_lock_live_group(cgrp)) | 2409 | spin_lock(&release_agent_path_lock); |
| 2206 | return -ENODEV; | ||
| 2207 | seq_puts(seq, cgrp->root->release_agent_path); | 2410 | seq_puts(seq, cgrp->root->release_agent_path); |
| 2411 | spin_unlock(&release_agent_path_lock); | ||
| 2208 | seq_putc(seq, '\n'); | 2412 | seq_putc(seq, '\n'); |
| 2209 | mutex_unlock(&cgroup_mutex); | ||
| 2210 | return 0; | 2413 | return 0; |
| 2211 | } | 2414 | } |
| 2212 | 2415 | ||
| @@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | |||
| 2218 | return 0; | 2421 | return 0; |
| 2219 | } | 2422 | } |
| 2220 | 2423 | ||
| 2424 | static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) | ||
| 2425 | { | ||
| 2426 | struct cgroup_subsys *ss; | ||
| 2427 | bool printed = false; | ||
| 2428 | int ssid; | ||
| 2429 | |||
| 2430 | for_each_subsys(ss, ssid) { | ||
| 2431 | if (ss_mask & (1 << ssid)) { | ||
| 2432 | if (printed) | ||
| 2433 | seq_putc(seq, ' '); | ||
| 2434 | seq_printf(seq, "%s", ss->name); | ||
| 2435 | printed = true; | ||
| 2436 | } | ||
| 2437 | } | ||
| 2438 | if (printed) | ||
| 2439 | seq_putc(seq, '\n'); | ||
| 2440 | } | ||
| 2441 | |||
| 2442 | /* show controllers which are currently attached to the default hierarchy */ | ||
| 2443 | static int cgroup_root_controllers_show(struct seq_file *seq, void *v) | ||
| 2444 | { | ||
| 2445 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
| 2446 | |||
| 2447 | cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & | ||
| 2448 | ~cgrp_dfl_root_inhibit_ss_mask); | ||
| 2449 | return 0; | ||
| 2450 | } | ||
| 2451 | |||
| 2452 | /* show controllers which are enabled from the parent */ | ||
| 2453 | static int cgroup_controllers_show(struct seq_file *seq, void *v) | ||
| 2454 | { | ||
| 2455 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
| 2456 | |||
| 2457 | cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); | ||
| 2458 | return 0; | ||
| 2459 | } | ||
| 2460 | |||
| 2461 | /* show controllers which are enabled for a given cgroup's children */ | ||
| 2462 | static int cgroup_subtree_control_show(struct seq_file *seq, void *v) | ||
| 2463 | { | ||
| 2464 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
| 2465 | |||
| 2466 | cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); | ||
| 2467 | return 0; | ||
| 2468 | } | ||
| 2469 | |||
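From userland, the files added above read as space-separated controller lists, and cgroup.subtree_control is written with "+name"/"-name" tokens. A sketch of a consumer, assuming the default hierarchy is mounted at /sys/fs/cgroup (neither the mount point nor the available controller names are fixed by the kernel):

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/fs/cgroup/cgroup.controllers", "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("available: %s", buf);	/* e.g. "memory" */
		fclose(f);
	}

	/* enable the (assumed available) memory controller for children */
	f = fopen("/sys/fs/cgroup/cgroup.subtree_control", "w");
	if (f) {
		fputs("+memory", f);
		fclose(f);
	}
	return 0;
}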
| 2470 | /** | ||
| 2471 | * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy | ||
| 2472 | * @cgrp: root of the subtree to update csses for | ||
| 2473 | * | ||
| 2474 | * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) | ||
| 2475 | * css associations need to be updated accordingly. This function looks up | ||
| 2476 | * all css_sets which are attached to the subtree, creates the matching | ||
| 2477 | * updated css_sets and migrates the tasks to the new ones. | ||
| 2478 | */ | ||
| 2479 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) | ||
| 2480 | { | ||
| 2481 | LIST_HEAD(preloaded_csets); | ||
| 2482 | struct cgroup_subsys_state *css; | ||
| 2483 | struct css_set *src_cset; | ||
| 2484 | int ret; | ||
| 2485 | |||
| 2486 | lockdep_assert_held(&cgroup_mutex); | ||
| 2487 | |||
| 2488 | /* look up all csses currently attached to @cgrp's subtree */ | ||
| 2489 | down_read(&css_set_rwsem); | ||
| 2490 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { | ||
| 2491 | struct cgrp_cset_link *link; | ||
| 2492 | |||
| 2493 | /* self is not affected by child_subsys_mask change */ | ||
| 2494 | if (css->cgroup == cgrp) | ||
| 2495 | continue; | ||
| 2496 | |||
| 2497 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) | ||
| 2498 | cgroup_migrate_add_src(link->cset, cgrp, | ||
| 2499 | &preloaded_csets); | ||
| 2500 | } | ||
| 2501 | up_read(&css_set_rwsem); | ||
| 2502 | |||
| 2503 | /* NULL dst indicates self on default hierarchy */ | ||
| 2504 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); | ||
| 2505 | if (ret) | ||
| 2506 | goto out_finish; | ||
| 2507 | |||
| 2508 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { | ||
| 2509 | struct task_struct *last_task = NULL, *task; | ||
| 2510 | |||
| 2511 | /* src_csets precede dst_csets, break on the first dst_cset */ | ||
| 2512 | if (!src_cset->mg_src_cgrp) | ||
| 2513 | break; | ||
| 2514 | |||
| 2515 | /* | ||
| 2516 | * All tasks in src_cset need to be migrated to the | ||
| 2517 | * matching dst_cset. Empty it process by process. We | ||
| 2518 | * walk tasks but migrate processes. The leader might even | ||
| 2519 | * belong to a different cset but such src_cset would also | ||
| 2520 | * be among the target src_csets because the default | ||
| 2521 | * hierarchy enforces per-process membership. | ||
| 2522 | */ | ||
| 2523 | while (true) { | ||
| 2524 | down_read(&css_set_rwsem); | ||
| 2525 | task = list_first_entry_or_null(&src_cset->tasks, | ||
| 2526 | struct task_struct, cg_list); | ||
| 2527 | if (task) { | ||
| 2528 | task = task->group_leader; | ||
| 2529 | WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); | ||
| 2530 | get_task_struct(task); | ||
| 2531 | } | ||
| 2532 | up_read(&css_set_rwsem); | ||
| 2533 | |||
| 2534 | if (!task) | ||
| 2535 | break; | ||
| 2536 | |||
| 2537 | /* guard against possible infinite loop */ | ||
| 2538 | if (WARN(last_task == task, | ||
| 2539 | "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) | ||
| 2540 | goto out_finish; | ||
| 2541 | last_task = task; | ||
| 2542 | |||
| 2543 | threadgroup_lock(task); | ||
| 2544 | /* raced against de_thread() from another thread? */ | ||
| 2545 | if (!thread_group_leader(task)) { | ||
| 2546 | threadgroup_unlock(task); | ||
| 2547 | put_task_struct(task); | ||
| 2548 | continue; | ||
| 2549 | } | ||
| 2550 | |||
| 2551 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | ||
| 2552 | |||
| 2553 | threadgroup_unlock(task); | ||
| 2554 | put_task_struct(task); | ||
| 2555 | |||
| 2556 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | ||
| 2557 | goto out_finish; | ||
| 2558 | } | ||
| 2559 | } | ||
| 2560 | |||
| 2561 | out_finish: | ||
| 2562 | cgroup_migrate_finish(&preloaded_csets); | ||
| 2563 | return ret; | ||
| 2564 | } | ||
| 2565 | |||
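The migration loop above relies on a recheck pattern: pin a task, take its threadgroup lock, and only proceed if it is still the group leader, since de_thread() can hand leadership to another thread in the meantime. The same shape in isolation (my_act() is a hypothetical stand-in for the real work):

#include <linux/sched.h>

static int my_act(struct task_struct *task);	/* hypothetical action */

static int act_on_leader(struct task_struct *task)
{
	int ret;

	get_task_struct(task);
	threadgroup_lock(task);
	if (thread_group_leader(task))
		ret = my_act(task);
	else
		ret = -EAGAIN;		/* raced with de_thread() */
	threadgroup_unlock(task);
	put_task_struct(task);
	return ret;
}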
| 2566 | /* change the enabled child controllers for a cgroup in the default hierarchy */ | ||
| 2567 | static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | ||
| 2568 | char *buf, size_t nbytes, | ||
| 2569 | loff_t off) | ||
| 2570 | { | ||
| 2571 | unsigned int enable = 0, disable = 0; | ||
| 2572 | struct cgroup *cgrp, *child; | ||
| 2573 | struct cgroup_subsys *ss; | ||
| 2574 | char *tok; | ||
| 2575 | int ssid, ret; | ||
| 2576 | |||
| 2577 | /* | ||
| 2578 | * Parse input - space separated list of subsystem names prefixed | ||
| 2579 | * with either + or -. | ||
| 2580 | */ | ||
| 2581 | buf = strstrip(buf); | ||
| 2582 | while ((tok = strsep(&buf, " "))) { | ||
| 2583 | if (tok[0] == '\0') | ||
| 2584 | continue; | ||
| 2585 | for_each_subsys(ss, ssid) { | ||
| 2586 | if (ss->disabled || strcmp(tok + 1, ss->name) || | ||
| 2587 | ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) | ||
| 2588 | continue; | ||
| 2589 | |||
| 2590 | if (*tok == '+') { | ||
| 2591 | enable |= 1 << ssid; | ||
| 2592 | disable &= ~(1 << ssid); | ||
| 2593 | } else if (*tok == '-') { | ||
| 2594 | disable |= 1 << ssid; | ||
| 2595 | enable &= ~(1 << ssid); | ||
| 2596 | } else { | ||
| 2597 | return -EINVAL; | ||
| 2598 | } | ||
| 2599 | break; | ||
| 2600 | } | ||
| 2601 | if (ssid == CGROUP_SUBSYS_COUNT) | ||
| 2602 | return -EINVAL; | ||
| 2603 | } | ||
| 2604 | |||
| 2605 | cgrp = cgroup_kn_lock_live(of->kn); | ||
| 2606 | if (!cgrp) | ||
| 2607 | return -ENODEV; | ||
| 2608 | |||
| 2609 | for_each_subsys(ss, ssid) { | ||
| 2610 | if (enable & (1 << ssid)) { | ||
| 2611 | if (cgrp->child_subsys_mask & (1 << ssid)) { | ||
| 2612 | enable &= ~(1 << ssid); | ||
| 2613 | continue; | ||
| 2614 | } | ||
| 2615 | |||
| 2616 | /* | ||
| 2617 | * Because css offlining is asynchronous, userland | ||
| 2618 | * might try to re-enable the same controller while | ||
| 2619 | * the previous instance is still around. In such | ||
| 2620 | * cases, wait till it's gone using offline_waitq. | ||
| 2621 | */ | ||
| 2622 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2623 | DEFINE_WAIT(wait); | ||
| 2624 | |||
| 2625 | if (!cgroup_css(child, ss)) | ||
| 2626 | continue; | ||
| 2627 | |||
| 2628 | cgroup_get(child); | ||
| 2629 | prepare_to_wait(&child->offline_waitq, &wait, | ||
| 2630 | TASK_UNINTERRUPTIBLE); | ||
| 2631 | cgroup_kn_unlock(of->kn); | ||
| 2632 | schedule(); | ||
| 2633 | finish_wait(&child->offline_waitq, &wait); | ||
| 2634 | cgroup_put(child); | ||
| 2635 | |||
| 2636 | return restart_syscall(); | ||
| 2637 | } | ||
| 2638 | |||
| 2639 | /* unavailable or not enabled on the parent? */ | ||
| 2640 | if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || | ||
| 2641 | (cgroup_parent(cgrp) && | ||
| 2642 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { | ||
| 2643 | ret = -ENOENT; | ||
| 2644 | goto out_unlock; | ||
| 2645 | } | ||
| 2646 | } else if (disable & (1 << ssid)) { | ||
| 2647 | if (!(cgrp->child_subsys_mask & (1 << ssid))) { | ||
| 2648 | disable &= ~(1 << ssid); | ||
| 2649 | continue; | ||
| 2650 | } | ||
| 2651 | |||
| 2652 | /* a child has it enabled? */ | ||
| 2653 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2654 | if (child->child_subsys_mask & (1 << ssid)) { | ||
| 2655 | ret = -EBUSY; | ||
| 2656 | goto out_unlock; | ||
| 2657 | } | ||
| 2658 | } | ||
| 2659 | } | ||
| 2660 | } | ||
| 2661 | |||
| 2662 | if (!enable && !disable) { | ||
| 2663 | ret = 0; | ||
| 2664 | goto out_unlock; | ||
| 2665 | } | ||
| 2666 | |||
| 2667 | /* | ||
| 2668 | * Except for the root, child_subsys_mask must be zero for a cgroup | ||
| 2669 | * with tasks so that child cgroups don't compete against tasks. | ||
| 2670 | */ | ||
| 2671 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | ||
| 2672 | ret = -EBUSY; | ||
| 2673 | goto out_unlock; | ||
| 2674 | } | ||
| 2675 | |||
| 2676 | /* | ||
| 2677 | * Create csses for enables and update child_subsys_mask. This | ||
| 2678 | * changes cgroup_e_css() results which in turn makes the | ||
| 2679 | * subsequent cgroup_update_dfl_csses() associate all tasks in the | ||
| 2680 | * subtree to the updated csses. | ||
| 2681 | */ | ||
| 2682 | for_each_subsys(ss, ssid) { | ||
| 2683 | if (!(enable & (1 << ssid))) | ||
| 2684 | continue; | ||
| 2685 | |||
| 2686 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2687 | ret = create_css(child, ss); | ||
| 2688 | if (ret) | ||
| 2689 | goto err_undo_css; | ||
| 2690 | } | ||
| 2691 | } | ||
| 2692 | |||
| 2693 | cgrp->child_subsys_mask |= enable; | ||
| 2694 | cgrp->child_subsys_mask &= ~disable; | ||
| 2695 | |||
| 2696 | ret = cgroup_update_dfl_csses(cgrp); | ||
| 2697 | if (ret) | ||
| 2698 | goto err_undo_css; | ||
| 2699 | |||
| 2700 | /* all tasks are now migrated away from the old csses, kill them */ | ||
| 2701 | for_each_subsys(ss, ssid) { | ||
| 2702 | if (!(disable & (1 << ssid))) | ||
| 2703 | continue; | ||
| 2704 | |||
| 2705 | cgroup_for_each_live_child(child, cgrp) | ||
| 2706 | kill_css(cgroup_css(child, ss)); | ||
| 2707 | } | ||
| 2708 | |||
| 2709 | kernfs_activate(cgrp->kn); | ||
| 2710 | ret = 0; | ||
| 2711 | out_unlock: | ||
| 2712 | cgroup_kn_unlock(of->kn); | ||
| 2713 | return ret ?: nbytes; | ||
| 2714 | |||
| 2715 | err_undo_css: | ||
| 2716 | cgrp->child_subsys_mask &= ~enable; | ||
| 2717 | cgrp->child_subsys_mask |= disable; | ||
| 2718 | |||
| 2719 | for_each_subsys(ss, ssid) { | ||
| 2720 | if (!(enable & (1 << ssid))) | ||
| 2721 | continue; | ||
| 2722 | |||
| 2723 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2724 | struct cgroup_subsys_state *css = cgroup_css(child, ss); | ||
| 2725 | if (css) | ||
| 2726 | kill_css(css); | ||
| 2727 | } | ||
| 2728 | } | ||
| 2729 | goto out_unlock; | ||
| 2730 | } | ||
| 2731 | |||
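The token parser at the top of cgroup_subtree_control_write() keeps the enable and disable masks disjoint: a later "-x" cancels an earlier "+x" and vice versa. The same logic in isolation (lookup_bit() is a hypothetical name lookup returning a bit index or a negative error):

#include <linux/string.h>

static int lookup_bit(const char *name);	/* hypothetical */

static int parse_ctrl_list(char *buf, unsigned int *enable,
			   unsigned int *disable)
{
	char *tok;

	while ((tok = strsep(&buf, " "))) {
		int bit;

		if (tok[0] == '\0')
			continue;
		if (*tok != '+' && *tok != '-')
			return -EINVAL;
		bit = lookup_bit(tok + 1);
		if (bit < 0)
			return -EINVAL;
		if (*tok == '+') {
			*enable |= 1U << bit;
			*disable &= ~(1U << bit);
		} else {
			*disable |= 1U << bit;
			*enable &= ~(1U << bit);
		}
	}
	return 0;
}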
| 2732 | static int cgroup_populated_show(struct seq_file *seq, void *v) | ||
| 2733 | { | ||
| 2734 | seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); | ||
| 2735 | return 0; | ||
| 2736 | } | ||
| 2737 | |||
| 2221 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | 2738 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
| 2222 | size_t nbytes, loff_t off) | 2739 | size_t nbytes, loff_t off) |
| 2223 | { | 2740 | { |
| @@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
| 2226 | struct cgroup_subsys_state *css; | 2743 | struct cgroup_subsys_state *css; |
| 2227 | int ret; | 2744 | int ret; |
| 2228 | 2745 | ||
| 2746 | if (cft->write) | ||
| 2747 | return cft->write(of, buf, nbytes, off); | ||
| 2748 | |||
| 2229 | /* | 2749 | /* |
| 2230 | * kernfs guarantees that a file isn't deleted with operations in | 2750 | * kernfs guarantees that a file isn't deleted with operations in |
| 2231 | * flight, which means that the matching css is and stays alive and | 2751 | * flight, which means that the matching css is and stays alive and |
| @@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
| 2236 | css = cgroup_css(cgrp, cft->ss); | 2756 | css = cgroup_css(cgrp, cft->ss); |
| 2237 | rcu_read_unlock(); | 2757 | rcu_read_unlock(); |
| 2238 | 2758 | ||
| 2239 | if (cft->write_string) { | 2759 | if (cft->write_u64) { |
| 2240 | ret = cft->write_string(css, cft, strstrip(buf)); | ||
| 2241 | } else if (cft->write_u64) { | ||
| 2242 | unsigned long long v; | 2760 | unsigned long long v; |
| 2243 | ret = kstrtoull(buf, 0, &v); | 2761 | ret = kstrtoull(buf, 0, &v); |
| 2244 | if (!ret) | 2762 | if (!ret) |
| @@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
| 2248 | ret = kstrtoll(buf, 0, &v); | 2766 | ret = kstrtoll(buf, 0, &v); |
| 2249 | if (!ret) | 2767 | if (!ret) |
| 2250 | ret = cft->write_s64(css, cft, v); | 2768 | ret = cft->write_s64(css, cft, v); |
| 2251 | } else if (cft->trigger) { | ||
| 2252 | ret = cft->trigger(css, (unsigned int)cft->private); | ||
| 2253 | } else { | 2769 | } else { |
| 2254 | ret = -EINVAL; | 2770 | ret = -EINVAL; |
| 2255 | } | 2771 | } |
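With cft->write dispatched first in cgroup_file_write() above, a controller can claim the whole write path instead of going through the typed write_u64/write_s64 helpers. A minimal cftype using it (names are illustrative):

#include <linux/cgroup.h>

static ssize_t my_knob_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	/* parse @buf here; returning nbytes reports full consumption */
	return nbytes;
}

static struct cftype my_files[] = {
	{
		.name = "my.knob",
		.write = my_knob_write,
	},
	{ }	/* terminate */
};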
| @@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
| 2326 | return -EPERM; | 2842 | return -EPERM; |
| 2327 | 2843 | ||
| 2328 | /* | 2844 | /* |
| 2329 | * We're gonna grab cgroup_tree_mutex which nests outside kernfs | 2845 | * We're gonna grab cgroup_mutex which nests outside kernfs |
| 2330 | * active_ref. kernfs_rename() doesn't require active_ref | 2846 | * active_ref. kernfs_rename() doesn't require active_ref |
| 2331 | * protection. Break them before grabbing cgroup_tree_mutex. | 2847 | * protection. Break them before grabbing cgroup_mutex. |
| 2332 | */ | 2848 | */ |
| 2333 | kernfs_break_active_protection(new_parent); | 2849 | kernfs_break_active_protection(new_parent); |
| 2334 | kernfs_break_active_protection(kn); | 2850 | kernfs_break_active_protection(kn); |
| 2335 | 2851 | ||
| 2336 | mutex_lock(&cgroup_tree_mutex); | ||
| 2337 | mutex_lock(&cgroup_mutex); | 2852 | mutex_lock(&cgroup_mutex); |
| 2338 | 2853 | ||
| 2339 | ret = kernfs_rename(kn, new_parent, new_name_str); | 2854 | ret = kernfs_rename(kn, new_parent, new_name_str); |
| 2340 | 2855 | ||
| 2341 | mutex_unlock(&cgroup_mutex); | 2856 | mutex_unlock(&cgroup_mutex); |
| 2342 | mutex_unlock(&cgroup_tree_mutex); | ||
| 2343 | 2857 | ||
| 2344 | kernfs_unbreak_active_protection(kn); | 2858 | kernfs_unbreak_active_protection(kn); |
| 2345 | kernfs_unbreak_active_protection(new_parent); | 2859 | kernfs_unbreak_active_protection(new_parent); |
| @@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
| 2377 | return PTR_ERR(kn); | 2891 | return PTR_ERR(kn); |
| 2378 | 2892 | ||
| 2379 | ret = cgroup_kn_set_ugid(kn); | 2893 | ret = cgroup_kn_set_ugid(kn); |
| 2380 | if (ret) | 2894 | if (ret) { |
| 2381 | kernfs_remove(kn); | 2895 | kernfs_remove(kn); |
| 2382 | return ret; | 2896 | return ret; |
| 2897 | } | ||
| 2898 | |||
| 2899 | if (cft->seq_show == cgroup_populated_show) | ||
| 2900 | cgrp->populated_kn = kn; | ||
| 2901 | return 0; | ||
| 2383 | } | 2902 | } |
| 2384 | 2903 | ||
| 2385 | /** | 2904 | /** |
| @@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
| 2399 | struct cftype *cft; | 2918 | struct cftype *cft; |
| 2400 | int ret; | 2919 | int ret; |
| 2401 | 2920 | ||
| 2402 | lockdep_assert_held(&cgroup_tree_mutex); | 2921 | lockdep_assert_held(&cgroup_mutex); |
| 2403 | 2922 | ||
| 2404 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2923 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
| 2405 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2924 | /* does cft->flags tell us to skip this file on @cgrp? */ |
| @@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
| 2407 | continue; | 2926 | continue; |
| 2408 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) | 2927 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) |
| 2409 | continue; | 2928 | continue; |
| 2410 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | 2929 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) |
| 2411 | continue; | 2930 | continue; |
| 2412 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2931 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) |
| 2413 | continue; | 2932 | continue; |
| 2414 | 2933 | ||
| 2415 | if (is_add) { | 2934 | if (is_add) { |
| 2416 | ret = cgroup_add_file(cgrp, cft); | 2935 | ret = cgroup_add_file(cgrp, cft); |
| 2417 | if (ret) { | 2936 | if (ret) { |
| 2418 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2937 | pr_warn("%s: failed to add %s, err=%d\n", |
| 2419 | cft->name, ret); | 2938 | __func__, cft->name, ret); |
| 2420 | return ret; | 2939 | return ret; |
| 2421 | } | 2940 | } |
| 2422 | } else { | 2941 | } else { |
| @@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) | |||
| 2434 | struct cgroup_subsys_state *css; | 2953 | struct cgroup_subsys_state *css; |
| 2435 | int ret = 0; | 2954 | int ret = 0; |
| 2436 | 2955 | ||
| 2437 | lockdep_assert_held(&cgroup_tree_mutex); | 2956 | lockdep_assert_held(&cgroup_mutex); |
| 2438 | |||
| 2439 | /* don't bother if @ss isn't attached */ | ||
| 2440 | if (ss->root == &cgrp_dfl_root) | ||
| 2441 | return 0; | ||
| 2442 | 2957 | ||
| 2443 | /* add/rm files for all cgroups created before */ | 2958 | /* add/rm files for all cgroups created before */ |
| 2444 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { | 2959 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
| @@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 2506 | 3021 | ||
| 2507 | static int cgroup_rm_cftypes_locked(struct cftype *cfts) | 3022 | static int cgroup_rm_cftypes_locked(struct cftype *cfts) |
| 2508 | { | 3023 | { |
| 2509 | lockdep_assert_held(&cgroup_tree_mutex); | 3024 | lockdep_assert_held(&cgroup_mutex); |
| 2510 | 3025 | ||
| 2511 | if (!cfts || !cfts[0].ss) | 3026 | if (!cfts || !cfts[0].ss) |
| 2512 | return -ENOENT; | 3027 | return -ENOENT; |
| @@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) | |||
| 2532 | { | 3047 | { |
| 2533 | int ret; | 3048 | int ret; |
| 2534 | 3049 | ||
| 2535 | mutex_lock(&cgroup_tree_mutex); | 3050 | mutex_lock(&cgroup_mutex); |
| 2536 | ret = cgroup_rm_cftypes_locked(cfts); | 3051 | ret = cgroup_rm_cftypes_locked(cfts); |
| 2537 | mutex_unlock(&cgroup_tree_mutex); | 3052 | mutex_unlock(&cgroup_mutex); |
| 2538 | return ret; | 3053 | return ret; |
| 2539 | } | 3054 | } |
| 2540 | 3055 | ||
| @@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 2556 | { | 3071 | { |
| 2557 | int ret; | 3072 | int ret; |
| 2558 | 3073 | ||
| 3074 | if (ss->disabled) | ||
| 3075 | return 0; | ||
| 3076 | |||
| 2559 | if (!cfts || cfts[0].name[0] == '\0') | 3077 | if (!cfts || cfts[0].name[0] == '\0') |
| 2560 | return 0; | 3078 | return 0; |
| 2561 | 3079 | ||
| @@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 2563 | if (ret) | 3081 | if (ret) |
| 2564 | return ret; | 3082 | return ret; |
| 2565 | 3083 | ||
| 2566 | mutex_lock(&cgroup_tree_mutex); | 3084 | mutex_lock(&cgroup_mutex); |
| 2567 | 3085 | ||
| 2568 | list_add_tail(&cfts->node, &ss->cfts); | 3086 | list_add_tail(&cfts->node, &ss->cfts); |
| 2569 | ret = cgroup_apply_cftypes(cfts, true); | 3087 | ret = cgroup_apply_cftypes(cfts, true); |
| 2570 | if (ret) | 3088 | if (ret) |
| 2571 | cgroup_rm_cftypes_locked(cfts); | 3089 | cgroup_rm_cftypes_locked(cfts); |
| 2572 | 3090 | ||
| 2573 | mutex_unlock(&cgroup_tree_mutex); | 3091 | mutex_unlock(&cgroup_mutex); |
| 2574 | return ret; | 3092 | return ret; |
| 2575 | } | 3093 | } |
| 2576 | 3094 | ||
| @@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
| 2594 | 3112 | ||
| 2595 | /** | 3113 | /** |
| 2596 | * css_next_child - find the next child of a given css | 3114 | * css_next_child - find the next child of a given css |
| 2597 | * @pos_css: the current position (%NULL to initiate traversal) | 3115 | * @pos: the current position (%NULL to initiate traversal) |
| 2598 | * @parent_css: css whose children to walk | 3116 | * @parent: css whose children to walk |
| 2599 | * | 3117 | * |
| 2600 | * This function returns the next child of @parent_css and should be called | 3118 | * This function returns the next child of @parent and should be called |
| 2601 | * under either cgroup_mutex or RCU read lock. The only requirement is | 3119 | * under either cgroup_mutex or RCU read lock. The only requirement is |
| 2602 | * that @parent_css and @pos_css are accessible. The next sibling is | 3120 | * that @parent and @pos are accessible. The next sibling is guaranteed to |
| 2603 | * guaranteed to be returned regardless of their states. | 3121 | * be returned regardless of their states. |
| 3122 | * | ||
| 3123 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
| 3124 | * css which finished ->css_online() is guaranteed to be visible in the | ||
| 3125 | * future iterations and will stay visible until the last reference is put. | ||
| 3126 | * A css which hasn't finished ->css_online() or already finished | ||
| 3127 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
| 3128 | * responsibility to synchronize against on/offlining. | ||
| 2604 | */ | 3129 | */ |
| 2605 | struct cgroup_subsys_state * | 3130 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, |
| 2606 | css_next_child(struct cgroup_subsys_state *pos_css, | 3131 | struct cgroup_subsys_state *parent) |
| 2607 | struct cgroup_subsys_state *parent_css) | ||
| 2608 | { | 3132 | { |
| 2609 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | 3133 | struct cgroup_subsys_state *next; |
| 2610 | struct cgroup *cgrp = parent_css->cgroup; | ||
| 2611 | struct cgroup *next; | ||
| 2612 | 3134 | ||
| 2613 | cgroup_assert_mutexes_or_rcu_locked(); | 3135 | cgroup_assert_mutex_or_rcu_locked(); |
| 2614 | 3136 | ||
| 2615 | /* | 3137 | /* |
| 2616 | * @pos could already have been removed. Once a cgroup is removed, | 3138 | * @pos could already have been unlinked from the sibling list. |
| 2617 | * its ->sibling.next is no longer updated when its next sibling | 3139 | * Once a cgroup is removed, its ->sibling.next is no longer |
| 2618 | * changes. As CGRP_DEAD assertion is serialized and happens | 3140 | * updated when its next sibling changes. CSS_RELEASED is set when |
| 2619 | * before the cgroup is taken off the ->sibling list, if we see it | 3141 | * @pos is taken off list, at which time its next pointer is valid, |
| 2620 | * unasserted, it's guaranteed that the next sibling hasn't | 3142 | * and, as releases are serialized, the one pointed to by the next |
| 2621 | * finished its grace period even if it's already removed, and thus | 3143 | * pointer is guaranteed to not have started release yet. This |
| 2622 | * safe to dereference from this RCU critical section. If | 3144 | * implies that if we observe !CSS_RELEASED on @pos in this RCU |
| 2623 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3145 | * critical section, the one pointed to by its next pointer is |
| 2624 | * to be visible as %true here. | 3146 | * guaranteed to not have finished its RCU grace period even if we |
| 3147 | * have dropped rcu_read_lock() in between iterations. | ||
| 2625 | * | 3148 | * |
| 2626 | * If @pos is dead, its next pointer can't be dereferenced; | 3149 | * If @pos has CSS_RELEASED set, its next pointer can't be |
| 2627 | * however, as each cgroup is given a monotonically increasing | 3150 | * dereferenced; however, as each css is given a monotonically |
| 2628 | * unique serial number and always appended to the sibling list, | 3151 | * increasing unique serial number and always appended to the |
| 2629 | * the next one can be found by walking the parent's children until | 3152 | * sibling list, the next one can be found by walking the parent's |
| 2630 | * we see a cgroup with higher serial number than @pos's. While | 3153 | * children until the first css with higher serial number than |
| 2631 | * this path can be slower, it's taken only when either the current | 3154 | * @pos's. While this path can be slower, it happens iff iteration |
| 2632 | * cgroup is removed or iteration and removal race. | 3155 | * races against release and the race window is very small. |
| 2633 | */ | 3156 | */ |
| 2634 | if (!pos) { | 3157 | if (!pos) { |
| 2635 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | 3158 | next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); |
| 2636 | } else if (likely(!cgroup_is_dead(pos))) { | 3159 | } else if (likely(!(pos->flags & CSS_RELEASED))) { |
| 2637 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3160 | next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); |
| 2638 | } else { | 3161 | } else { |
| 2639 | list_for_each_entry_rcu(next, &cgrp->children, sibling) | 3162 | list_for_each_entry_rcu(next, &parent->children, sibling) |
| 2640 | if (next->serial_nr > pos->serial_nr) | 3163 | if (next->serial_nr > pos->serial_nr) |
| 2641 | break; | 3164 | break; |
| 2642 | } | 3165 | } |
| 2643 | 3166 | ||
| 2644 | if (&next->sibling == &cgrp->children) | 3167 | /* |
| 2645 | return NULL; | 3168 | * @next, if not pointing to the head, can be dereferenced and is |
| 2646 | 3169 | * the next sibling. | |
| 2647 | return cgroup_css(next, parent_css->ss); | 3170 | */ |
| 3171 | if (&next->sibling != &parent->children) | ||
| 3172 | return next; | ||
| 3173 | return NULL; | ||
| 2648 | } | 3174 | } |
| 2649 | 3175 | ||
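A typical caller of the traversal above holds only rcu_read_lock() and tolerates the visibility rules spelled out in the comment. For example (count_children() is an illustrative helper, not a kernel function):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static int count_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	css_for_each_child(child, parent)
		n++;	/* pin @child before using it outside RCU */
	rcu_read_unlock();
	return n;
}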
| 2650 | /** | 3176 | /** |
| @@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, | |||
| 2660 | * doesn't require the whole traversal to be contained in a single critical | 3186 | * doesn't require the whole traversal to be contained in a single critical |
| 2661 | * section. This function will return the correct next descendant as long | 3187 | * section. This function will return the correct next descendant as long |
| 2662 | * as both @pos and @root are accessible and @pos is a descendant of @root. | 3188 | * as both @pos and @root are accessible and @pos is a descendant of @root. |
| 3189 | * | ||
| 3190 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
| 3191 | * css which finished ->css_online() is guaranteed to be visible in the | ||
| 3192 | * future iterations and will stay visible until the last reference is put. | ||
| 3193 | * A css which hasn't finished ->css_online() or already finished | ||
| 3194 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
| 3195 | * responsibility to synchronize against on/offlining. | ||
| 2663 | */ | 3196 | */ |
| 2664 | struct cgroup_subsys_state * | 3197 | struct cgroup_subsys_state * |
| 2665 | css_next_descendant_pre(struct cgroup_subsys_state *pos, | 3198 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
| @@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
| 2667 | { | 3200 | { |
| 2668 | struct cgroup_subsys_state *next; | 3201 | struct cgroup_subsys_state *next; |
| 2669 | 3202 | ||
| 2670 | cgroup_assert_mutexes_or_rcu_locked(); | 3203 | cgroup_assert_mutex_or_rcu_locked(); |
| 2671 | 3204 | ||
| 2672 | /* if first iteration, visit @root */ | 3205 | /* if first iteration, visit @root */ |
| 2673 | if (!pos) | 3206 | if (!pos) |
| @@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
| 2680 | 3213 | ||
| 2681 | /* no child, visit my or the closest ancestor's next sibling */ | 3214 | /* no child, visit my or the closest ancestor's next sibling */ |
| 2682 | while (pos != root) { | 3215 | while (pos != root) { |
| 2683 | next = css_next_child(pos, css_parent(pos)); | 3216 | next = css_next_child(pos, pos->parent); |
| 2684 | if (next) | 3217 | if (next) |
| 2685 | return next; | 3218 | return next; |
| 2686 | pos = css_parent(pos); | 3219 | pos = pos->parent; |
| 2687 | } | 3220 | } |
| 2688 | 3221 | ||
| 2689 | return NULL; | 3222 | return NULL; |
| @@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) | |||
| 2707 | { | 3240 | { |
| 2708 | struct cgroup_subsys_state *last, *tmp; | 3241 | struct cgroup_subsys_state *last, *tmp; |
| 2709 | 3242 | ||
| 2710 | cgroup_assert_mutexes_or_rcu_locked(); | 3243 | cgroup_assert_mutex_or_rcu_locked(); |
| 2711 | 3244 | ||
| 2712 | do { | 3245 | do { |
| 2713 | last = pos; | 3246 | last = pos; |
| @@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) | |||
| 2747 | * section. This function will return the correct next descendant as long | 3280 | * section. This function will return the correct next descendant as long |
| 2748 | * as both @pos and @cgroup are accessible and @pos is a descendant of | 3281 | * as both @pos and @cgroup are accessible and @pos is a descendant of |
| 2749 | * @cgroup. | 3282 | * @cgroup. |
| 3283 | * | ||
| 3284 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
| 3285 | * css which finished ->css_online() is guaranteed to be visible in the | ||
| 3286 | * future iterations and will stay visible until the last reference is put. | ||
| 3287 | * A css which hasn't finished ->css_online() or already finished | ||
| 3288 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
| 3289 | * responsibility to synchronize against on/offlining. | ||
| 2750 | */ | 3290 | */ |
| 2751 | struct cgroup_subsys_state * | 3291 | struct cgroup_subsys_state * |
| 2752 | css_next_descendant_post(struct cgroup_subsys_state *pos, | 3292 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
| @@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
| 2754 | { | 3294 | { |
| 2755 | struct cgroup_subsys_state *next; | 3295 | struct cgroup_subsys_state *next; |
| 2756 | 3296 | ||
| 2757 | cgroup_assert_mutexes_or_rcu_locked(); | 3297 | cgroup_assert_mutex_or_rcu_locked(); |
| 2758 | 3298 | ||
| 2759 | /* if first iteration, visit leftmost descendant which may be @root */ | 3299 | /* if first iteration, visit leftmost descendant which may be @root */ |
| 2760 | if (!pos) | 3300 | if (!pos) |
| @@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
| 2765 | return NULL; | 3305 | return NULL; |
| 2766 | 3306 | ||
| 2767 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3307 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
| 2768 | next = css_next_child(pos, css_parent(pos)); | 3308 | next = css_next_child(pos, pos->parent); |
| 2769 | if (next) | 3309 | if (next) |
| 2770 | return css_leftmost_descendant(next); | 3310 | return css_leftmost_descendant(next); |
| 2771 | 3311 | ||
| 2772 | /* no sibling left, visit parent */ | 3312 | /* no sibling left, visit parent */ |
| 2773 | return css_parent(pos); | 3313 | return pos->parent; |
| 3314 | } | ||
| 3315 | |||
| 3316 | /** | ||
| 3317 | * css_has_online_children - does a css have online children | ||
| 3318 | * @css: the target css | ||
| 3319 | * | ||
| 3320 | * Returns %true if @css has any online children; otherwise, %false. This | ||
| 3321 | * function can be called from any context but the caller is responsible | ||
| 3322 | * for synchronizing against on/offlining as necessary. | ||
| 3323 | */ | ||
| 3324 | bool css_has_online_children(struct cgroup_subsys_state *css) | ||
| 3325 | { | ||
| 3326 | struct cgroup_subsys_state *child; | ||
| 3327 | bool ret = false; | ||
| 3328 | |||
| 3329 | rcu_read_lock(); | ||
| 3330 | css_for_each_child(child, css) { | ||
| 3331 | if (child->flags & CSS_ONLINE) { | ||
| 3332 | ret = true; | ||
| 3333 | break; | ||
| 3334 | } | ||
| 3335 | } | ||
| 3336 | rcu_read_unlock(); | ||
| 3337 | return ret; | ||
| 2774 | } | 3338 | } |
| 2775 | 3339 | ||
| 2776 | /** | 3340 | /** |
| @@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
| 2781 | */ | 3345 | */ |
| 2782 | static void css_advance_task_iter(struct css_task_iter *it) | 3346 | static void css_advance_task_iter(struct css_task_iter *it) |
| 2783 | { | 3347 | { |
| 2784 | struct list_head *l = it->cset_link; | 3348 | struct list_head *l = it->cset_pos; |
| 2785 | struct cgrp_cset_link *link; | 3349 | struct cgrp_cset_link *link; |
| 2786 | struct css_set *cset; | 3350 | struct css_set *cset; |
| 2787 | 3351 | ||
| 2788 | /* Advance to the next non-empty css_set */ | 3352 | /* Advance to the next non-empty css_set */ |
| 2789 | do { | 3353 | do { |
| 2790 | l = l->next; | 3354 | l = l->next; |
| 2791 | if (l == &it->origin_css->cgroup->cset_links) { | 3355 | if (l == it->cset_head) { |
| 2792 | it->cset_link = NULL; | 3356 | it->cset_pos = NULL; |
| 2793 | return; | 3357 | return; |
| 2794 | } | 3358 | } |
| 2795 | link = list_entry(l, struct cgrp_cset_link, cset_link); | 3359 | |
| 2796 | cset = link->cset; | 3360 | if (it->ss) { |
| 3361 | cset = container_of(l, struct css_set, | ||
| 3362 | e_cset_node[it->ss->id]); | ||
| 3363 | } else { | ||
| 3364 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
| 3365 | cset = link->cset; | ||
| 3366 | } | ||
| 2797 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); | 3367 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); |
| 2798 | 3368 | ||
| 2799 | it->cset_link = l; | 3369 | it->cset_pos = l; |
| 2800 | 3370 | ||
| 2801 | if (!list_empty(&cset->tasks)) | 3371 | if (!list_empty(&cset->tasks)) |
| 2802 | it->task = cset->tasks.next; | 3372 | it->task_pos = cset->tasks.next; |
| 2803 | else | 3373 | else |
| 2804 | it->task = cset->mg_tasks.next; | 3374 | it->task_pos = cset->mg_tasks.next; |
| 3375 | |||
| 3376 | it->tasks_head = &cset->tasks; | ||
| 3377 | it->mg_tasks_head = &cset->mg_tasks; | ||
| 2805 | } | 3378 | } |
| 2806 | 3379 | ||
| 2807 | /** | 3380 | /** |
| @@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
| 2827 | 3400 | ||
| 2828 | down_read(&css_set_rwsem); | 3401 | down_read(&css_set_rwsem); |
| 2829 | 3402 | ||
| 2830 | it->origin_css = css; | 3403 | it->ss = css->ss; |
| 2831 | it->cset_link = &css->cgroup->cset_links; | 3404 | |
| 3405 | if (it->ss) | ||
| 3406 | it->cset_pos = &css->cgroup->e_csets[css->ss->id]; | ||
| 3407 | else | ||
| 3408 | it->cset_pos = &css->cgroup->cset_links; | ||
| 3409 | |||
| 3410 | it->cset_head = it->cset_pos; | ||
| 2832 | 3411 | ||
| 2833 | css_advance_task_iter(it); | 3412 | css_advance_task_iter(it); |
| 2834 | } | 3413 | } |
| @@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
| 2844 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | 3423 | struct task_struct *css_task_iter_next(struct css_task_iter *it) |
| 2845 | { | 3424 | { |
| 2846 | struct task_struct *res; | 3425 | struct task_struct *res; |
| 2847 | struct list_head *l = it->task; | 3426 | struct list_head *l = it->task_pos; |
| 2848 | struct cgrp_cset_link *link = list_entry(it->cset_link, | ||
| 2849 | struct cgrp_cset_link, cset_link); | ||
| 2850 | 3427 | ||
| 2851 | /* If the iterator cg is NULL, we have no tasks */ | 3428 | /* If the iterator cg is NULL, we have no tasks */ |
| 2852 | if (!it->cset_link) | 3429 | if (!it->cset_pos) |
| 2853 | return NULL; | 3430 | return NULL; |
| 2854 | res = list_entry(l, struct task_struct, cg_list); | 3431 | res = list_entry(l, struct task_struct, cg_list); |
| 2855 | 3432 | ||
| @@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
| 2860 | */ | 3437 | */ |
| 2861 | l = l->next; | 3438 | l = l->next; |
| 2862 | 3439 | ||
| 2863 | if (l == &link->cset->tasks) | 3440 | if (l == it->tasks_head) |
| 2864 | l = link->cset->mg_tasks.next; | 3441 | l = it->mg_tasks_head->next; |
| 2865 | 3442 | ||
| 2866 | if (l == &link->cset->mg_tasks) | 3443 | if (l == it->mg_tasks_head) |
| 2867 | css_advance_task_iter(it); | 3444 | css_advance_task_iter(it); |
| 2868 | else | 3445 | else |
| 2869 | it->task = l; | 3446 | it->task_pos = l; |
| 2870 | 3447 | ||
| 2871 | return res; | 3448 | return res; |
| 2872 | } | 3449 | } |
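The iterator's usual shape is start/next/end; css_task_iter_end() (outside this hunk) drops the css_set_rwsem taken by css_task_iter_start(). A sketch (count_css_tasks() is illustrative):

#include <linux/cgroup.h>

static int count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);
	return n;
}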
| @@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
| 2919 | * ->can_attach() fails. | 3496 | * ->can_attach() fails. |
| 2920 | */ | 3497 | */ |
| 2921 | do { | 3498 | do { |
| 2922 | css_task_iter_start(&from->dummy_css, &it); | 3499 | css_task_iter_start(&from->self, &it); |
| 2923 | task = css_task_iter_next(&it); | 3500 | task = css_task_iter_next(&it); |
| 2924 | if (task) | 3501 | if (task) |
| 2925 | get_task_struct(task); | 3502 | get_task_struct(task); |
| @@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3184 | if (!array) | 3761 | if (!array) |
| 3185 | return -ENOMEM; | 3762 | return -ENOMEM; |
| 3186 | /* now, populate the array */ | 3763 | /* now, populate the array */ |
| 3187 | css_task_iter_start(&cgrp->dummy_css, &it); | 3764 | css_task_iter_start(&cgrp->self, &it); |
| 3188 | while ((tsk = css_task_iter_next(&it))) { | 3765 | while ((tsk = css_task_iter_next(&it))) { |
| 3189 | if (unlikely(n == length)) | 3766 | if (unlikely(n == length)) |
| 3190 | break; | 3767 | break; |
| @@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3246 | 3823 | ||
| 3247 | /* | 3824 | /* |
| 3248 | * We aren't being called from kernfs and there's no guarantee on | 3825 | * We aren't being called from kernfs and there's no guarantee on |
| 3249 | * @kn->priv's validity. For this and css_tryget_from_dir(), | 3826 | * @kn->priv's validity. For this and css_tryget_online_from_dir(), |
| 3250 | * @kn->priv is RCU safe. Let's do the RCU dancing. | 3827 | * @kn->priv is RCU safe. Let's do the RCU dancing. |
| 3251 | */ | 3828 | */ |
| 3252 | rcu_read_lock(); | 3829 | rcu_read_lock(); |
| @@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3258 | } | 3835 | } |
| 3259 | rcu_read_unlock(); | 3836 | rcu_read_unlock(); |
| 3260 | 3837 | ||
| 3261 | css_task_iter_start(&cgrp->dummy_css, &it); | 3838 | css_task_iter_start(&cgrp->self, &it); |
| 3262 | while ((tsk = css_task_iter_next(&it))) { | 3839 | while ((tsk = css_task_iter_next(&it))) { |
| 3263 | switch (tsk->state) { | 3840 | switch (tsk->state) { |
| 3264 | case TASK_RUNNING: | 3841 | case TASK_RUNNING: |
| @@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) | |||
| 3388 | return seq_printf(s, "%d\n", *(int *)v); | 3965 | return seq_printf(s, "%d\n", *(int *)v); |
| 3389 | } | 3966 | } |
| 3390 | 3967 | ||
| 3391 | /* | ||
| 3392 | * seq_operations functions for iterating on pidlists through seq_file - | ||
| 3393 | * independent of whether it's tasks or procs | ||
| 3394 | */ | ||
| 3395 | static const struct seq_operations cgroup_pidlist_seq_operations = { | ||
| 3396 | .start = cgroup_pidlist_start, | ||
| 3397 | .stop = cgroup_pidlist_stop, | ||
| 3398 | .next = cgroup_pidlist_next, | ||
| 3399 | .show = cgroup_pidlist_show, | ||
| 3400 | }; | ||
| 3401 | |||
| 3402 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 3968 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
| 3403 | struct cftype *cft) | 3969 | struct cftype *cft) |
| 3404 | { | 3970 | { |
| @@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3440 | .seq_stop = cgroup_pidlist_stop, | 4006 | .seq_stop = cgroup_pidlist_stop, |
| 3441 | .seq_show = cgroup_pidlist_show, | 4007 | .seq_show = cgroup_pidlist_show, |
| 3442 | .private = CGROUP_FILE_PROCS, | 4008 | .private = CGROUP_FILE_PROCS, |
| 3443 | .write_u64 = cgroup_procs_write, | 4009 | .write = cgroup_procs_write, |
| 3444 | .mode = S_IRUGO | S_IWUSR, | 4010 | .mode = S_IRUGO | S_IWUSR, |
| 3445 | }, | 4011 | }, |
| 3446 | { | 4012 | { |
| @@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = { | |||
| 3454 | .flags = CFTYPE_ONLY_ON_ROOT, | 4020 | .flags = CFTYPE_ONLY_ON_ROOT, |
| 3455 | .seq_show = cgroup_sane_behavior_show, | 4021 | .seq_show = cgroup_sane_behavior_show, |
| 3456 | }, | 4022 | }, |
| 4023 | { | ||
| 4024 | .name = "cgroup.controllers", | ||
| 4025 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, | ||
| 4026 | .seq_show = cgroup_root_controllers_show, | ||
| 4027 | }, | ||
| 4028 | { | ||
| 4029 | .name = "cgroup.controllers", | ||
| 4030 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | ||
| 4031 | .seq_show = cgroup_controllers_show, | ||
| 4032 | }, | ||
| 4033 | { | ||
| 4034 | .name = "cgroup.subtree_control", | ||
| 4035 | .flags = CFTYPE_ONLY_ON_DFL, | ||
| 4036 | .seq_show = cgroup_subtree_control_show, | ||
| 4037 | .write = cgroup_subtree_control_write, | ||
| 4038 | }, | ||
| 4039 | { | ||
| 4040 | .name = "cgroup.populated", | ||
| 4041 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | ||
| 4042 | .seq_show = cgroup_populated_show, | ||
| 4043 | }, | ||
| 3457 | 4044 | ||
| 3458 | /* | 4045 | /* |
| 3459 | * Historical crazy stuff. These don't have "cgroup." prefix and | 4046 | * Historical crazy stuff. These don't have "cgroup." prefix and |
| @@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3468 | .seq_stop = cgroup_pidlist_stop, | 4055 | .seq_stop = cgroup_pidlist_stop, |
| 3469 | .seq_show = cgroup_pidlist_show, | 4056 | .seq_show = cgroup_pidlist_show, |
| 3470 | .private = CGROUP_FILE_TASKS, | 4057 | .private = CGROUP_FILE_TASKS, |
| 3471 | .write_u64 = cgroup_tasks_write, | 4058 | .write = cgroup_tasks_write, |
| 3472 | .mode = S_IRUGO | S_IWUSR, | 4059 | .mode = S_IRUGO | S_IWUSR, |
| 3473 | }, | 4060 | }, |
| 3474 | { | 4061 | { |
| @@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3481 | .name = "release_agent", | 4068 | .name = "release_agent", |
| 3482 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, | 4069 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
| 3483 | .seq_show = cgroup_release_agent_show, | 4070 | .seq_show = cgroup_release_agent_show, |
| 3484 | .write_string = cgroup_release_agent_write, | 4071 | .write = cgroup_release_agent_write, |
| 3485 | .max_write_len = PATH_MAX - 1, | 4072 | .max_write_len = PATH_MAX - 1, |
| 3486 | }, | 4073 | }, |
| 3487 | { } /* terminate */ | 4074 | { } /* terminate */ |
| @@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3494 | * | 4081 | * |
| 3495 | * On failure, no file is added. | 4082 | * On failure, no file is added. |
| 3496 | */ | 4083 | */ |
| 3497 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 4084 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) |
| 3498 | { | 4085 | { |
| 3499 | struct cgroup_subsys *ss; | 4086 | struct cgroup_subsys *ss; |
| 3500 | int i, ret = 0; | 4087 | int i, ret = 0; |
| @@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | |||
| 3503 | for_each_subsys(ss, i) { | 4090 | for_each_subsys(ss, i) { |
| 3504 | struct cftype *cfts; | 4091 | struct cftype *cfts; |
| 3505 | 4092 | ||
| 3506 | if (!test_bit(i, &subsys_mask)) | 4093 | if (!(subsys_mask & (1 << i))) |
| 3507 | continue; | 4094 | continue; |
| 3508 | 4095 | ||
| 3509 | list_for_each_entry(cfts, &ss->cfts, node) { | 4096 | list_for_each_entry(cfts, &ss->cfts, node) { |
| @@ -3525,9 +4112,9 @@ err: | |||
| 3525 | * Implemented in kill_css(). | 4112 | * Implemented in kill_css(). |
| 3526 | * | 4113 | * |
| 3527 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | 4114 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs |
| 3528 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | 4115 | * and thus css_tryget_online() is guaranteed to fail, the css can be |
| 3529 | * by invoking offline_css(). After offlining, the base ref is put. | 4116 | * offlined by invoking offline_css(). After offlining, the base ref is |
| 3530 | * Implemented in css_killed_work_fn(). | 4117 | * put. Implemented in css_killed_work_fn(). |
| 3531 | * | 4118 | * |
| 3532 | * 3. When the percpu_ref reaches zero, the only possible remaining | 4119 | * 3. When the percpu_ref reaches zero, the only possible remaining |
| 3533 | * accessors are inside RCU read sections. css_release() schedules the | 4120 | * accessors are inside RCU read sections. css_release() schedules the |
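The three-step teardown above is carried entirely by percpu_ref's kill/confirm protocol. A minimal sketch of the pattern, with every name (my_obj and friends) hypothetical rather than taken from this file:

    #include <linux/kernel.h>
    #include <linux/percpu-refcount.h>

    struct my_obj {
            struct percpu_ref refcnt;       /* inited with my_obj_release */
    };

    /* step 2: the ref is seen as killed on all CPUs; tryget fails everywhere */
    static void my_obj_confirm_kill(struct percpu_ref *ref)
    {
            /* safe to "offline" the object; bounce to a workqueue to sleep */
    }

    /* step 3: the last reference is gone; only RCU readers may remain */
    static void my_obj_release(struct percpu_ref *ref)
    {
            /* defer the actual free past an RCU grace period */
    }

    /* step 1: initiate destruction */
    static void my_obj_kill(struct my_obj *obj)
    {
            percpu_ref_kill_and_confirm(&obj->refcnt, my_obj_confirm_kill);
    }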
| @@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work) | |||
| 3546 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4133 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 3547 | struct cgroup *cgrp = css->cgroup; | 4134 | struct cgroup *cgrp = css->cgroup; |
| 3548 | 4135 | ||
| 3549 | if (css->parent) | 4136 | if (css->ss) { |
| 3550 | css_put(css->parent); | 4137 | /* css free path */ |
| 4138 | if (css->parent) | ||
| 4139 | css_put(css->parent); | ||
| 3551 | 4140 | ||
| 3552 | css->ss->css_free(css); | 4141 | css->ss->css_free(css); |
| 3553 | cgroup_put(cgrp); | 4142 | cgroup_put(cgrp); |
| 4143 | } else { | ||
| 4144 | /* cgroup free path */ | ||
| 4145 | atomic_dec(&cgrp->root->nr_cgrps); | ||
| 4146 | cgroup_pidlist_destroy_all(cgrp); | ||
| 4147 | |||
| 4148 | if (cgroup_parent(cgrp)) { | ||
| 4149 | /* | ||
| 4150 | * We get a ref to the parent, and put the ref when | ||
| 4151 | * this cgroup is being freed, so it's guaranteed | ||
| 4152 | * that the parent won't be destroyed before its | ||
| 4153 | * children. | ||
| 4154 | */ | ||
| 4155 | cgroup_put(cgroup_parent(cgrp)); | ||
| 4156 | kernfs_put(cgrp->kn); | ||
| 4157 | kfree(cgrp); | ||
| 4158 | } else { | ||
| 4159 | /* | ||
| 4160 | * This is the root cgroup's refcnt reaching zero, | ||
| 4161 | * which indicates that the root should be | ||
| 4162 | * released. | ||
| 4163 | */ | ||
| 4164 | cgroup_destroy_root(cgrp->root); | ||
| 4165 | } | ||
| 4166 | } | ||
| 3554 | } | 4167 | } |
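Both branches above run out of one work item; they are told apart by css->ss, which is NULL only for the cgrp->self css embedded in every cgroup. A hedged sketch of that dispatch (free_one is illustrative, not this file's helper):

    /* Sketch: one free callback, two object kinds, keyed on css->ss. */
    static void free_one(struct cgroup_subsys_state *css)
    {
            if (css->ss) {
                    /* per-subsystem state: hand it back to the controller */
                    css->ss->css_free(css);
            } else {
                    /* no subsystem: @css is the self css embedded in a cgroup */
                    kfree(container_of(css, struct cgroup, self));
            }
    }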
| 3555 | 4168 | ||
| 3556 | static void css_free_rcu_fn(struct rcu_head *rcu_head) | 4169 | static void css_free_rcu_fn(struct rcu_head *rcu_head) |
| @@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) | |||
| 3562 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 4175 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
| 3563 | } | 4176 | } |
| 3564 | 4177 | ||
| 4178 | static void css_release_work_fn(struct work_struct *work) | ||
| 4179 | { | ||
| 4180 | struct cgroup_subsys_state *css = | ||
| 4181 | container_of(work, struct cgroup_subsys_state, destroy_work); | ||
| 4182 | struct cgroup_subsys *ss = css->ss; | ||
| 4183 | struct cgroup *cgrp = css->cgroup; | ||
| 4184 | |||
| 4185 | mutex_lock(&cgroup_mutex); | ||
| 4186 | |||
| 4187 | css->flags |= CSS_RELEASED; | ||
| 4188 | list_del_rcu(&css->sibling); | ||
| 4189 | |||
| 4190 | if (ss) { | ||
| 4191 | /* css release path */ | ||
| 4192 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
| 4193 | } else { | ||
| 4194 | /* cgroup release path */ | ||
| 4195 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
| 4196 | cgrp->id = -1; | ||
| 4197 | } | ||
| 4198 | |||
| 4199 | mutex_unlock(&cgroup_mutex); | ||
| 4200 | |||
| 4201 | call_rcu(&css->rcu_head, css_free_rcu_fn); | ||
| 4202 | } | ||
| 4203 | |||
| 3565 | static void css_release(struct percpu_ref *ref) | 4204 | static void css_release(struct percpu_ref *ref) |
| 3566 | { | 4205 | { |
| 3567 | struct cgroup_subsys_state *css = | 4206 | struct cgroup_subsys_state *css = |
| 3568 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4207 | container_of(ref, struct cgroup_subsys_state, refcnt); |
| 3569 | 4208 | ||
| 3570 | RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); | 4209 | INIT_WORK(&css->destroy_work, css_release_work_fn); |
| 3571 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 4210 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
| 3572 | } | 4211 | } |
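css_release() can be invoked from atomic context by the percpu_ref machinery, so it only queues css_release_work_fn(), which may sleep on cgroup_mutex and still defers the final free one more RCU grace period via css_free_rcu_fn(). The generic shape of that chain, under hypothetical names:

    #include <linux/percpu-refcount.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct my_obj {
            struct percpu_ref refcnt;
            struct work_struct destroy_work;
            struct rcu_head rcu_head;
    };

    static void my_free_rcu(struct rcu_head *head)
    {
            /* a grace period has elapsed: no RCU reader can still see us */
            kfree(container_of(head, struct my_obj, rcu_head));
    }

    static void my_release_work(struct work_struct *work)
    {
            struct my_obj *obj = container_of(work, struct my_obj, destroy_work);

            /* process context: safe to take mutexes, drop IDR ids, etc. */
            call_rcu(&obj->rcu_head, my_free_rcu);
    }

    static void my_release(struct percpu_ref *ref) /* may be atomic context */
    {
            struct my_obj *obj = container_of(ref, struct my_obj, refcnt);

            INIT_WORK(&obj->destroy_work, my_release_work);
            queue_work(system_wq, &obj->destroy_work);
    }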
| 3573 | 4212 | ||
| 3574 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, | 4213 | static void init_and_link_css(struct cgroup_subsys_state *css, |
| 3575 | struct cgroup *cgrp) | 4214 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 3576 | { | 4215 | { |
| 4216 | lockdep_assert_held(&cgroup_mutex); | ||
| 4217 | |||
| 4218 | cgroup_get(cgrp); | ||
| 4219 | |||
| 4220 | memset(css, 0, sizeof(*css)); | ||
| 3577 | css->cgroup = cgrp; | 4221 | css->cgroup = cgrp; |
| 3578 | css->ss = ss; | 4222 | css->ss = ss; |
| 3579 | css->flags = 0; | 4223 | INIT_LIST_HEAD(&css->sibling); |
| 4224 | INIT_LIST_HEAD(&css->children); | ||
| 4225 | css->serial_nr = css_serial_nr_next++; | ||
| 3580 | 4226 | ||
| 3581 | if (cgrp->parent) | 4227 | if (cgroup_parent(cgrp)) { |
| 3582 | css->parent = cgroup_css(cgrp->parent, ss); | 4228 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); |
| 3583 | else | 4229 | css_get(css->parent); |
| 3584 | css->flags |= CSS_ROOT; | 4230 | } |
| 3585 | 4231 | ||
| 3586 | BUG_ON(cgroup_css(cgrp, ss)); | 4232 | BUG_ON(cgroup_css(cgrp, ss)); |
| 3587 | } | 4233 | } |
| @@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css) | |||
| 3592 | struct cgroup_subsys *ss = css->ss; | 4238 | struct cgroup_subsys *ss = css->ss; |
| 3593 | int ret = 0; | 4239 | int ret = 0; |
| 3594 | 4240 | ||
| 3595 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3596 | lockdep_assert_held(&cgroup_mutex); | 4241 | lockdep_assert_held(&cgroup_mutex); |
| 3597 | 4242 | ||
| 3598 | if (ss->css_online) | 4243 | if (ss->css_online) |
| 3599 | ret = ss->css_online(css); | 4244 | ret = ss->css_online(css); |
| 3600 | if (!ret) { | 4245 | if (!ret) { |
| 3601 | css->flags |= CSS_ONLINE; | 4246 | css->flags |= CSS_ONLINE; |
| 3602 | css->cgroup->nr_css++; | ||
| 3603 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); | 4247 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); |
| 3604 | } | 4248 | } |
| 3605 | return ret; | 4249 | return ret; |
| @@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
| 3610 | { | 4254 | { |
| 3611 | struct cgroup_subsys *ss = css->ss; | 4255 | struct cgroup_subsys *ss = css->ss; |
| 3612 | 4256 | ||
| 3613 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3614 | lockdep_assert_held(&cgroup_mutex); | 4257 | lockdep_assert_held(&cgroup_mutex); |
| 3615 | 4258 | ||
| 3616 | if (!(css->flags & CSS_ONLINE)) | 4259 | if (!(css->flags & CSS_ONLINE)) |
| @@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
| 3620 | ss->css_offline(css); | 4263 | ss->css_offline(css); |
| 3621 | 4264 | ||
| 3622 | css->flags &= ~CSS_ONLINE; | 4265 | css->flags &= ~CSS_ONLINE; |
| 3623 | css->cgroup->nr_css--; | 4266 | RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); |
| 3624 | RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); | 4267 | |
| 4268 | wake_up_all(&css->cgroup->offline_waitq); | ||
| 3625 | } | 4269 | } |
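The wake_up_all() added above implies a waiter on offline_waitq elsewhere in the diff (the rebind path); that consumer isn't in this hunk, so the following wait-side pattern is a sketch under that assumption, with wait_one_offline a hypothetical name:

    /* Sketch: sleep until one of @cgrp's csses finishes going offline. */
    static void wait_one_offline(struct cgroup *cgrp)
    {
            DEFINE_WAIT(wait);

            prepare_to_wait(&cgrp->offline_waitq, &wait, TASK_UNINTERRUPTIBLE);
            mutex_unlock(&cgroup_mutex);    /* never sleep holding the master lock */
            schedule();
            finish_wait(&cgrp->offline_waitq, &wait);
            mutex_lock(&cgroup_mutex);      /* caller must re-check state */
    }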
| 3626 | 4270 | ||
| 3627 | /** | 4271 | /** |
| @@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
| 3635 | */ | 4279 | */ |
| 3636 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | 4280 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) |
| 3637 | { | 4281 | { |
| 3638 | struct cgroup *parent = cgrp->parent; | 4282 | struct cgroup *parent = cgroup_parent(cgrp); |
| 4283 | struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); | ||
| 3639 | struct cgroup_subsys_state *css; | 4284 | struct cgroup_subsys_state *css; |
| 3640 | int err; | 4285 | int err; |
| 3641 | 4286 | ||
| 3642 | lockdep_assert_held(&cgroup_mutex); | 4287 | lockdep_assert_held(&cgroup_mutex); |
| 3643 | 4288 | ||
| 3644 | css = ss->css_alloc(cgroup_css(parent, ss)); | 4289 | css = ss->css_alloc(parent_css); |
| 3645 | if (IS_ERR(css)) | 4290 | if (IS_ERR(css)) |
| 3646 | return PTR_ERR(css); | 4291 | return PTR_ERR(css); |
| 3647 | 4292 | ||
| 4293 | init_and_link_css(css, ss, cgrp); | ||
| 4294 | |||
| 3648 | err = percpu_ref_init(&css->refcnt, css_release); | 4295 | err = percpu_ref_init(&css->refcnt, css_release); |
| 3649 | if (err) | 4296 | if (err) |
| 3650 | goto err_free_css; | 4297 | goto err_free_css; |
| 3651 | 4298 | ||
| 3652 | init_css(css, ss, cgrp); | 4299 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); |
| 4300 | if (err < 0) | ||
| 4301 | goto err_free_percpu_ref; | ||
| 4302 | css->id = err; | ||
| 3653 | 4303 | ||
| 3654 | err = cgroup_populate_dir(cgrp, 1 << ss->id); | 4304 | err = cgroup_populate_dir(cgrp, 1 << ss->id); |
| 3655 | if (err) | 4305 | if (err) |
| 3656 | goto err_free_percpu_ref; | 4306 | goto err_free_id; |
| 4307 | |||
| 4308 | /* @css is ready to be brought online now, make it visible */ | ||
| 4309 | list_add_tail_rcu(&css->sibling, &parent_css->children); | ||
| 4310 | cgroup_idr_replace(&ss->css_idr, css, css->id); | ||
| 3657 | 4311 | ||
| 3658 | err = online_css(css); | 4312 | err = online_css(css); |
| 3659 | if (err) | 4313 | if (err) |
| 3660 | goto err_clear_dir; | 4314 | goto err_list_del; |
| 3661 | |||
| 3662 | cgroup_get(cgrp); | ||
| 3663 | css_get(css->parent); | ||
| 3664 | |||
| 3665 | cgrp->subsys_mask |= 1 << ss->id; | ||
| 3666 | 4315 | ||
| 3667 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4316 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
| 3668 | parent->parent) { | 4317 | cgroup_parent(parent)) { |
| 3669 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | 4318 | pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", |
| 3670 | current->comm, current->pid, ss->name); | 4319 | current->comm, current->pid, ss->name); |
| 3671 | if (!strcmp(ss->name, "memory")) | 4320 | if (!strcmp(ss->name, "memory")) |
| 3672 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | 4321 | pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); |
| 3673 | ss->warned_broken_hierarchy = true; | 4322 | ss->warned_broken_hierarchy = true; |
| 3674 | } | 4323 | } |
| 3675 | 4324 | ||
| 3676 | return 0; | 4325 | return 0; |
| 3677 | 4326 | ||
| 3678 | err_clear_dir: | 4327 | err_list_del: |
| 4328 | list_del_rcu(&css->sibling); | ||
| 3679 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 4329 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); |
| 4330 | err_free_id: | ||
| 4331 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
| 3680 | err_free_percpu_ref: | 4332 | err_free_percpu_ref: |
| 3681 | percpu_ref_cancel_init(&css->refcnt); | 4333 | percpu_ref_cancel_init(&css->refcnt); |
| 3682 | err_free_css: | 4334 | err_free_css: |
| 3683 | ss->css_free(css); | 4335 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
| 3684 | return err; | 4336 | return err; |
| 3685 | } | 4337 | } |
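create_css() above and cgroup_mkdir() below share the same two-step IDR publication: reserve the ID against NULL, finish construction, then idr_replace() so a concurrent idr_find() never returns a half-built object. A minimal sketch of the idiom, with my_idr/my_obj hypothetical (the file's cgroup_idr_* wrappers additionally take cgroup_idr_lock):

    #include <linux/idr.h>

    static DEFINE_IDR(my_idr);

    static int my_obj_publish(struct my_obj *obj)
    {
            /* reserve an ID >= 2; idr_find() on it returns NULL for now */
            int id = idr_alloc(&my_idr, NULL, 2, 0, GFP_KERNEL);

            if (id < 0)
                    return id;
            obj->id = id;

            /* ... finish initializing @obj; lookups still see NULL ... */

            idr_replace(&my_idr, obj, id);  /* publish: lookups now see @obj */
            return 0;
    }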
| 3686 | 4338 | ||
| 3687 | /** | 4339 | static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, |
| 3688 | * cgroup_create - create a cgroup | 4340 | umode_t mode) |
| 3689 | * @parent: cgroup that will be parent of the new cgroup | ||
| 3690 | * @name: name of the new cgroup | ||
| 3691 | * @mode: mode to set on new cgroup | ||
| 3692 | */ | ||
| 3693 | static long cgroup_create(struct cgroup *parent, const char *name, | ||
| 3694 | umode_t mode) | ||
| 3695 | { | 4341 | { |
| 3696 | struct cgroup *cgrp; | 4342 | struct cgroup *parent, *cgrp; |
| 3697 | struct cgroup_root *root = parent->root; | 4343 | struct cgroup_root *root; |
| 3698 | int ssid, err; | ||
| 3699 | struct cgroup_subsys *ss; | 4344 | struct cgroup_subsys *ss; |
| 3700 | struct kernfs_node *kn; | 4345 | struct kernfs_node *kn; |
| 4346 | int ssid, ret; | ||
| 3701 | 4347 | ||
| 3702 | /* | 4348 | parent = cgroup_kn_lock_live(parent_kn); |
| 3703 | * XXX: The default hierarchy isn't fully implemented yet. Block | 4349 | if (!parent) |
| 3704 | * !root cgroup creation on it for now. | 4350 | return -ENODEV; |
| 3705 | */ | 4351 | root = parent->root; |
| 3706 | if (root == &cgrp_dfl_root) | ||
| 3707 | return -EINVAL; | ||
| 3708 | 4352 | ||
| 3709 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | 4353 | /* allocate the cgroup and its ID, 0 is reserved for the root */ |
| 3710 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4354 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
| 3711 | if (!cgrp) | 4355 | if (!cgrp) { |
| 3712 | return -ENOMEM; | 4356 | ret = -ENOMEM; |
| 3713 | 4357 | goto out_unlock; | |
| 3714 | mutex_lock(&cgroup_tree_mutex); | ||
| 3715 | |||
| 3716 | /* | ||
| 3717 | * Only live parents can have children. Note that the liveliness | ||
| 3718 | * check isn't strictly necessary because cgroup_mkdir() and | ||
| 3719 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
| 3720 | * anyway so that locking is contained inside cgroup proper and we | ||
| 3721 | * don't get nasty surprises if we ever grow another caller. | ||
| 3722 | */ | ||
| 3723 | if (!cgroup_lock_live_group(parent)) { | ||
| 3724 | err = -ENODEV; | ||
| 3725 | goto err_unlock_tree; | ||
| 3726 | } | 4358 | } |
| 3727 | 4359 | ||
| 4360 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release); | ||
| 4361 | if (ret) | ||
| 4362 | goto out_free_cgrp; | ||
| 4363 | |||
| 3728 | /* | 4364 | /* |
| 3729 | * Temporarily set the pointer to NULL, so idr_find() won't return | 4365 | * Temporarily set the pointer to NULL, so idr_find() won't return |
| 3730 | * a half-baked cgroup. | 4366 | * a half-baked cgroup. |
| 3731 | */ | 4367 | */ |
| 3732 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | 4368 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); |
| 3733 | if (cgrp->id < 0) { | 4369 | if (cgrp->id < 0) { |
| 3734 | err = -ENOMEM; | 4370 | ret = -ENOMEM; |
| 3735 | goto err_unlock; | 4371 | goto out_cancel_ref; |
| 3736 | } | 4372 | } |
| 3737 | 4373 | ||
| 3738 | init_cgroup_housekeeping(cgrp); | 4374 | init_cgroup_housekeeping(cgrp); |
| 3739 | 4375 | ||
| 3740 | cgrp->parent = parent; | 4376 | cgrp->self.parent = &parent->self; |
| 3741 | cgrp->dummy_css.parent = &parent->dummy_css; | 4377 | cgrp->root = root; |
| 3742 | cgrp->root = parent->root; | ||
| 3743 | 4378 | ||
| 3744 | if (notify_on_release(parent)) | 4379 | if (notify_on_release(parent)) |
| 3745 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4380 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| @@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
| 3750 | /* create the directory */ | 4385 | /* create the directory */ |
| 3751 | kn = kernfs_create_dir(parent->kn, name, mode, cgrp); | 4386 | kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
| 3752 | if (IS_ERR(kn)) { | 4387 | if (IS_ERR(kn)) { |
| 3753 | err = PTR_ERR(kn); | 4388 | ret = PTR_ERR(kn); |
| 3754 | goto err_free_id; | 4389 | goto out_free_id; |
| 3755 | } | 4390 | } |
| 3756 | cgrp->kn = kn; | 4391 | cgrp->kn = kn; |
| 3757 | 4392 | ||
| @@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
| 3761 | */ | 4396 | */ |
| 3762 | kernfs_get(kn); | 4397 | kernfs_get(kn); |
| 3763 | 4398 | ||
| 3764 | cgrp->serial_nr = cgroup_serial_nr_next++; | 4399 | cgrp->self.serial_nr = css_serial_nr_next++; |
| 3765 | 4400 | ||
| 3766 | /* allocation complete, commit to creation */ | 4401 | /* allocation complete, commit to creation */ |
| 3767 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4402 | list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); |
| 3768 | atomic_inc(&root->nr_cgrps); | 4403 | atomic_inc(&root->nr_cgrps); |
| 3769 | cgroup_get(parent); | 4404 | cgroup_get(parent); |
| 3770 | 4405 | ||
| @@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
| 3772 | * @cgrp is now fully operational. If something fails after this | 4407 | * @cgrp is now fully operational. If something fails after this |
| 3773 | * point, it'll be released via the normal destruction path. | 4408 | * point, it'll be released via the normal destruction path. |
| 3774 | */ | 4409 | */ |
| 3775 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); | 4410 | cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
| 3776 | 4411 | ||
| 3777 | err = cgroup_kn_set_ugid(kn); | 4412 | ret = cgroup_kn_set_ugid(kn); |
| 3778 | if (err) | 4413 | if (ret) |
| 3779 | goto err_destroy; | 4414 | goto out_destroy; |
| 3780 | 4415 | ||
| 3781 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | 4416 | ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); |
| 3782 | if (err) | 4417 | if (ret) |
| 3783 | goto err_destroy; | 4418 | goto out_destroy; |
| 3784 | 4419 | ||
| 3785 | /* let's create and online css's */ | 4420 | /* let's create and online css's */ |
| 3786 | for_each_subsys(ss, ssid) { | 4421 | for_each_subsys(ss, ssid) { |
| 3787 | if (root->cgrp.subsys_mask & (1 << ssid)) { | 4422 | if (parent->child_subsys_mask & (1 << ssid)) { |
| 3788 | err = create_css(cgrp, ss); | 4423 | ret = create_css(cgrp, ss); |
| 3789 | if (err) | 4424 | if (ret) |
| 3790 | goto err_destroy; | 4425 | goto out_destroy; |
| 3791 | } | 4426 | } |
| 3792 | } | 4427 | } |
| 3793 | 4428 | ||
| 3794 | kernfs_activate(kn); | 4429 | /* |
| 4430 | * On the default hierarchy, a child doesn't automatically inherit | ||
| 4431 | * child_subsys_mask from the parent. Each is configured manually. | ||
| 4432 | */ | ||
| 4433 | if (!cgroup_on_dfl(cgrp)) | ||
| 4434 | cgrp->child_subsys_mask = parent->child_subsys_mask; | ||
| 3795 | 4435 | ||
| 3796 | mutex_unlock(&cgroup_mutex); | 4436 | kernfs_activate(kn); |
| 3797 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3798 | 4437 | ||
| 3799 | return 0; | 4438 | ret = 0; |
| 4439 | goto out_unlock; | ||
| 3800 | 4440 | ||
| 3801 | err_free_id: | 4441 | out_free_id: |
| 3802 | idr_remove(&root->cgroup_idr, cgrp->id); | 4442 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
| 3803 | err_unlock: | 4443 | out_cancel_ref: |
| 3804 | mutex_unlock(&cgroup_mutex); | 4444 | percpu_ref_cancel_init(&cgrp->self.refcnt); |
| 3805 | err_unlock_tree: | 4445 | out_free_cgrp: |
| 3806 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3807 | kfree(cgrp); | 4446 | kfree(cgrp); |
| 3808 | return err; | 4447 | out_unlock: |
| 4448 | cgroup_kn_unlock(parent_kn); | ||
| 4449 | return ret; | ||
| 3809 | 4450 | ||
| 3810 | err_destroy: | 4451 | out_destroy: |
| 3811 | cgroup_destroy_locked(cgrp); | 4452 | cgroup_destroy_locked(cgrp); |
| 3812 | mutex_unlock(&cgroup_mutex); | 4453 | goto out_unlock; |
| 3813 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3814 | return err; | ||
| 3815 | } | ||
| 3816 | |||
| 3817 | static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | ||
| 3818 | umode_t mode) | ||
| 3819 | { | ||
| 3820 | struct cgroup *parent = parent_kn->priv; | ||
| 3821 | int ret; | ||
| 3822 | |||
| 3823 | /* | ||
| 3824 | * cgroup_create() grabs cgroup_tree_mutex which nests outside | ||
| 3825 | * kernfs active_ref and cgroup_create() already synchronizes | ||
| 3826 | * properly against removal through cgroup_lock_live_group(). | ||
| 3827 | * Break it before calling cgroup_create(). | ||
| 3828 | */ | ||
| 3829 | cgroup_get(parent); | ||
| 3830 | kernfs_break_active_protection(parent_kn); | ||
| 3831 | |||
| 3832 | ret = cgroup_create(parent, name, mode); | ||
| 3833 | |||
| 3834 | kernfs_unbreak_active_protection(parent_kn); | ||
| 3835 | cgroup_put(parent); | ||
| 3836 | return ret; | ||
| 3837 | } | 4454 | } |
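The removed wrapper on the left broke kernfs active protection by hand; the new entry points delegate that to cgroup_kn_lock_live()/cgroup_kn_unlock(), defined earlier in the diff. A hedged sketch of the contract those helpers provide (not their exact bodies):

    static struct cgroup *kn_lock_live(struct kernfs_node *kn)
    {
            struct cgroup *cgrp = kn->priv;

            /* pin @cgrp and break active protection so self-removal of
             * @kn can proceed while we hold cgroup_mutex */
            cgroup_get(cgrp);
            kernfs_break_active_protection(kn);

            mutex_lock(&cgroup_mutex);
            if (!cgroup_is_dead(cgrp))
                    return cgrp;

            mutex_unlock(&cgroup_mutex);
            kernfs_unbreak_active_protection(kn);
            cgroup_put(cgrp);
            return NULL;
    }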
| 3838 | 4455 | ||
| 3839 | /* | 4456 | /* |
| 3840 | * This is called when the refcnt of a css is confirmed to be killed. | 4457 | * This is called when the refcnt of a css is confirmed to be killed. |
| 3841 | * css_tryget() is now guaranteed to fail. | 4458 | * css_tryget_online() is now guaranteed to fail. Tell the subsystem to |
| 4459 | * initiate destruction and put the css ref from kill_css(). | ||
| 3842 | */ | 4460 | */ |
| 3843 | static void css_killed_work_fn(struct work_struct *work) | 4461 | static void css_killed_work_fn(struct work_struct *work) |
| 3844 | { | 4462 | { |
| 3845 | struct cgroup_subsys_state *css = | 4463 | struct cgroup_subsys_state *css = |
| 3846 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4464 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 3847 | struct cgroup *cgrp = css->cgroup; | ||
| 3848 | 4465 | ||
| 3849 | mutex_lock(&cgroup_tree_mutex); | ||
| 3850 | mutex_lock(&cgroup_mutex); | 4466 | mutex_lock(&cgroup_mutex); |
| 3851 | |||
| 3852 | /* | ||
| 3853 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
| 3854 | * initate destruction. | ||
| 3855 | */ | ||
| 3856 | offline_css(css); | 4467 | offline_css(css); |
| 3857 | |||
| 3858 | /* | ||
| 3859 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
| 3860 | * be disabled before proceeding to the second phase of cgroup | ||
| 3861 | * destruction. If we are the last one, kick it off. | ||
| 3862 | */ | ||
| 3863 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
| 3864 | cgroup_destroy_css_killed(cgrp); | ||
| 3865 | |||
| 3866 | mutex_unlock(&cgroup_mutex); | 4468 | mutex_unlock(&cgroup_mutex); |
| 3867 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3868 | 4469 | ||
| 3869 | /* | ||
| 3870 | * Put the css refs from kill_css(). Each css holds an extra | ||
| 3871 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
| 3872 | * regardless of css refs. On the last put of each css, whenever | ||
| 3873 | * that may be, the extra dentry ref is put so that dentry | ||
| 3874 | * destruction happens only after all css's are released. | ||
| 3875 | */ | ||
| 3876 | css_put(css); | 4470 | css_put(css); |
| 3877 | } | 4471 | } |
| 3878 | 4472 | ||
| @@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
| 3886 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 4480 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
| 3887 | } | 4481 | } |
| 3888 | 4482 | ||
| 3889 | static void __kill_css(struct cgroup_subsys_state *css) | 4483 | /** |
| 4484 | * kill_css - destroy a css | ||
| 4485 | * @css: css to destroy | ||
| 4486 | * | ||
| 4487 | * This function initiates destruction of @css by removing cgroup interface | ||
| 4488 | * files and putting its base reference. ->css_offline() will be invoked | ||
| 4489 | * asynchronously once css_tryget_online() is guaranteed to fail and when | ||
| 4490 | * the reference count reaches zero, @css will be released. | ||
| 4491 | */ | ||
| 4492 | static void kill_css(struct cgroup_subsys_state *css) | ||
| 3890 | { | 4493 | { |
| 3891 | lockdep_assert_held(&cgroup_tree_mutex); | 4494 | lockdep_assert_held(&cgroup_mutex); |
| 3892 | 4495 | ||
| 3893 | /* | 4496 | /* |
| 3894 | * This must happen before css is disassociated with its cgroup. | 4497 | * This must happen before css is disassociated with its cgroup. |
| @@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css) | |||
| 3905 | /* | 4508 | /* |
| 3906 | * cgroup core guarantees that, by the time ->css_offline() is | 4509 | * cgroup core guarantees that, by the time ->css_offline() is |
| 3907 | * invoked, no new css reference will be given out via | 4510 | * invoked, no new css reference will be given out via |
| 3908 | * css_tryget(). We can't simply call percpu_ref_kill() and | 4511 | * css_tryget_online(). We can't simply call percpu_ref_kill() and |
| 3909 | * proceed to offlining css's because percpu_ref_kill() doesn't | 4512 | * proceed to offlining css's because percpu_ref_kill() doesn't |
| 3910 | * guarantee that the ref is seen as killed on all CPUs on return. | 4513 | * guarantee that the ref is seen as killed on all CPUs on return. |
| 3911 | * | 4514 | * |
| @@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css) | |||
| 3916 | } | 4519 | } |
| 3917 | 4520 | ||
| 3918 | /** | 4521 | /** |
| 3919 | * kill_css - destroy a css | ||
| 3920 | * @css: css to destroy | ||
| 3921 | * | ||
| 3922 | * This function initiates destruction of @css by removing cgroup interface | ||
| 3923 | * files and putting its base reference. ->css_offline() will be invoked | ||
| 3924 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
| 3925 | * reference count reaches zero, @css will be released. | ||
| 3926 | */ | ||
| 3927 | static void kill_css(struct cgroup_subsys_state *css) | ||
| 3928 | { | ||
| 3929 | struct cgroup *cgrp = css->cgroup; | ||
| 3930 | |||
| 3931 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3932 | |||
| 3933 | /* if already killed, noop */ | ||
| 3934 | if (cgrp->subsys_mask & (1 << css->ss->id)) { | ||
| 3935 | cgrp->subsys_mask &= ~(1 << css->ss->id); | ||
| 3936 | __kill_css(css); | ||
| 3937 | } | ||
| 3938 | } | ||
| 3939 | |||
| 3940 | /** | ||
| 3941 | * cgroup_destroy_locked - the first stage of cgroup destruction | 4522 | * cgroup_destroy_locked - the first stage of cgroup destruction |
| 3942 | * @cgrp: cgroup to be destroyed | 4523 | * @cgrp: cgroup to be destroyed |
| 3943 | * | 4524 | * |
| 3944 | * css's make use of percpu refcnts whose killing latency shouldn't be | 4525 | * css's make use of percpu refcnts whose killing latency shouldn't be |
| 3945 | * exposed to userland and are RCU protected. Also, cgroup core needs to | 4526 | * exposed to userland and are RCU protected. Also, cgroup core needs to |
| 3946 | * guarantee that css_tryget() won't succeed by the time ->css_offline() is | 4527 | * guarantee that css_tryget_online() won't succeed by the time |
| 3947 | * invoked. To satisfy all the requirements, destruction is implemented in | 4528 | * ->css_offline() is invoked. To satisfy all the requirements, |
| 3948 | * the following two steps. | 4529 | * destruction is implemented in the following two steps. |
| 3949 | * | 4530 | * |
| 3950 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all | 4531 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all |
| 3951 | * userland visible parts and start killing the percpu refcnts of | 4532 | * userland visible parts and start killing the percpu refcnts of |
| @@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css) | |||
| 3964 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4545 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
| 3965 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4546 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
| 3966 | { | 4547 | { |
| 3967 | struct cgroup *child; | ||
| 3968 | struct cgroup_subsys_state *css; | 4548 | struct cgroup_subsys_state *css; |
| 3969 | bool empty; | 4549 | bool empty; |
| 3970 | int ssid; | 4550 | int ssid; |
| 3971 | 4551 | ||
| 3972 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3973 | lockdep_assert_held(&cgroup_mutex); | 4552 | lockdep_assert_held(&cgroup_mutex); |
| 3974 | 4553 | ||
| 3975 | /* | 4554 | /* |
| @@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 3983 | return -EBUSY; | 4562 | return -EBUSY; |
| 3984 | 4563 | ||
| 3985 | /* | 4564 | /* |
| 3986 | * Make sure there are no live children. We can't test ->children | 4565 | * Make sure there are no live children. We can't test emptiness of |
| 3987 | * emptiness as dead children linger on it while being destroyed; | 4566 | * ->self.children as dead children linger on it while being |
| 3988 | * otherwise, "rmdir parent/child parent" may fail with -EBUSY. | 4567 | * drained; otherwise, "rmdir parent/child parent" may fail. |
| 3989 | */ | 4568 | */ |
| 3990 | empty = true; | 4569 | if (css_has_online_children(&cgrp->self)) |
| 3991 | rcu_read_lock(); | ||
| 3992 | list_for_each_entry_rcu(child, &cgrp->children, sibling) { | ||
| 3993 | empty = cgroup_is_dead(child); | ||
| 3994 | if (!empty) | ||
| 3995 | break; | ||
| 3996 | } | ||
| 3997 | rcu_read_unlock(); | ||
| 3998 | if (!empty) | ||
| 3999 | return -EBUSY; | 4570 | return -EBUSY; |
| 4000 | 4571 | ||
| 4001 | /* | 4572 | /* |
| 4002 | * Mark @cgrp dead. This prevents further task migration and child | 4573 | * Mark @cgrp dead. This prevents further task migration and child |
| 4003 | * creation by disabling cgroup_lock_live_group(). Note that | 4574 | * creation by disabling cgroup_lock_live_group(). |
| 4004 | * CGRP_DEAD assertion is depended upon by css_next_child() to | ||
| 4005 | * resume iteration after dropping RCU read lock. See | ||
| 4006 | * css_next_child() for details. | ||
| 4007 | */ | 4575 | */ |
| 4008 | set_bit(CGRP_DEAD, &cgrp->flags); | 4576 | cgrp->self.flags &= ~CSS_ONLINE; |
| 4009 | 4577 | ||
| 4010 | /* | 4578 | /* initiate massacre of all css's */ |
| 4011 | * Initiate massacre of all css's. cgroup_destroy_css_killed() | ||
| 4012 | * will be invoked to perform the rest of destruction once the | ||
| 4013 | * percpu refs of all css's are confirmed to be killed. This | ||
| 4014 | * involves removing the subsystem's files, drop cgroup_mutex. | ||
| 4015 | */ | ||
| 4016 | mutex_unlock(&cgroup_mutex); | ||
| 4017 | for_each_css(css, ssid, cgrp) | 4579 | for_each_css(css, ssid, cgrp) |
| 4018 | kill_css(css); | 4580 | kill_css(css); |
| 4019 | mutex_lock(&cgroup_mutex); | ||
| 4020 | 4581 | ||
| 4021 | /* CGRP_DEAD is set, remove from ->release_list for the last time */ | 4582 | /* CSS_ONLINE is clear, remove from ->release_list for the last time */ |
| 4022 | raw_spin_lock(&release_list_lock); | 4583 | raw_spin_lock(&release_list_lock); |
| 4023 | if (!list_empty(&cgrp->release_list)) | 4584 | if (!list_empty(&cgrp->release_list)) |
| 4024 | list_del_init(&cgrp->release_list); | 4585 | list_del_init(&cgrp->release_list); |
| 4025 | raw_spin_unlock(&release_list_lock); | 4586 | raw_spin_unlock(&release_list_lock); |
| 4026 | 4587 | ||
| 4027 | /* | 4588 | /* |
| 4028 | * If @cgrp has css's attached, the second stage of cgroup | 4589 | * Remove @cgrp directory along with the base files. @cgrp has an |
| 4029 | * destruction is kicked off from css_killed_work_fn() after the | 4590 | * extra ref on its kn. |
| 4030 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
| 4031 | * any css, we kick it off here. | ||
| 4032 | */ | 4591 | */ |
| 4033 | if (!cgrp->nr_css) | 4592 | kernfs_remove(cgrp->kn); |
| 4034 | cgroup_destroy_css_killed(cgrp); | ||
| 4035 | |||
| 4036 | /* remove @cgrp directory along with the base files */ | ||
| 4037 | mutex_unlock(&cgroup_mutex); | ||
| 4038 | 4593 | ||
| 4039 | /* | 4594 | set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); |
| 4040 | * There are two control paths which try to determine cgroup from | 4595 | check_for_release(cgroup_parent(cgrp)); |
| 4041 | * dentry without going through kernfs - cgroupstats_build() and | ||
| 4042 | * css_tryget_from_dir(). Those are supported by RCU protecting | ||
| 4043 | * clearing of cgrp->kn->priv backpointer, which should happen | ||
| 4044 | * after all files under it have been removed. | ||
| 4045 | */ | ||
| 4046 | kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ | ||
| 4047 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); | ||
| 4048 | 4596 | ||
| 4049 | mutex_lock(&cgroup_mutex); | 4597 | /* put the base reference */ |
| 4598 | percpu_ref_kill(&cgrp->self.refcnt); | ||
| 4050 | 4599 | ||
| 4051 | return 0; | 4600 | return 0; |
| 4052 | }; | 4601 | }; |
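css_has_online_children() replaces the hand-rolled ->children walk deleted above; the helper's body isn't in this hunk, but it plausibly reduces to an RCU-protected scan like the following sketch:

    /* Sketch: does @css have any child still marked CSS_ONLINE? */
    static bool has_online_children(struct cgroup_subsys_state *css)
    {
            struct cgroup_subsys_state *child;
            bool ret = false;

            rcu_read_lock();
            css_for_each_child(child, css) {
                    if (child->flags & CSS_ONLINE) {
                            ret = true;
                            break;
                    }
            }
            rcu_read_unlock();
            return ret;
    }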
| 4053 | 4602 | ||
| 4054 | /** | ||
| 4055 | * cgroup_destroy_css_killed - the second step of cgroup destruction | ||
| 4056 | * @work: cgroup->destroy_free_work | ||
| 4057 | * | ||
| 4058 | * This function is invoked from a work item for a cgroup which is being | ||
| 4059 | * destroyed after all css's are offlined and performs the rest of | ||
| 4060 | * destruction. This is the second step of destruction described in the | ||
| 4061 | * comment above cgroup_destroy_locked(). | ||
| 4062 | */ | ||
| 4063 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) | ||
| 4064 | { | ||
| 4065 | struct cgroup *parent = cgrp->parent; | ||
| 4066 | |||
| 4067 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 4068 | lockdep_assert_held(&cgroup_mutex); | ||
| 4069 | |||
| 4070 | /* delete this cgroup from parent->children */ | ||
| 4071 | list_del_rcu(&cgrp->sibling); | ||
| 4072 | |||
| 4073 | cgroup_put(cgrp); | ||
| 4074 | |||
| 4075 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
| 4076 | check_for_release(parent); | ||
| 4077 | } | ||
| 4078 | |||
| 4079 | static int cgroup_rmdir(struct kernfs_node *kn) | 4603 | static int cgroup_rmdir(struct kernfs_node *kn) |
| 4080 | { | 4604 | { |
| 4081 | struct cgroup *cgrp = kn->priv; | 4605 | struct cgroup *cgrp; |
| 4082 | int ret = 0; | 4606 | int ret = 0; |
| 4083 | 4607 | ||
| 4084 | /* | 4608 | cgrp = cgroup_kn_lock_live(kn); |
| 4085 | * This is self-destruction but @kn can't be removed while this | 4609 | if (!cgrp) |
| 4086 | * callback is in progress. Let's break active protection. Once | 4610 | return 0; |
| 4087 | * the protection is broken, @cgrp can be destroyed at any point. | 4611 | cgroup_get(cgrp); /* for @kn->priv clearing */ |
| 4088 | * Pin it so that it stays accessible. | ||
| 4089 | */ | ||
| 4090 | cgroup_get(cgrp); | ||
| 4091 | kernfs_break_active_protection(kn); | ||
| 4092 | 4612 | ||
| 4093 | mutex_lock(&cgroup_tree_mutex); | 4613 | ret = cgroup_destroy_locked(cgrp); |
| 4094 | mutex_lock(&cgroup_mutex); | 4614 | |
| 4615 | cgroup_kn_unlock(kn); | ||
| 4095 | 4616 | ||
| 4096 | /* | 4617 | /* |
| 4097 | * @cgrp might already have been destroyed while we're trying to | 4618 | * There are two control paths which try to determine cgroup from |
| 4098 | * grab the mutexes. | 4619 | * dentry without going through kernfs - cgroupstats_build() and |
| 4620 | * css_tryget_online_from_dir(). Those are supported by RCU | ||
| 4621 | * protecting clearing of cgrp->kn->priv backpointer, which should | ||
| 4622 | * happen after all files under it have been removed. | ||
| 4099 | */ | 4623 | */ |
| 4100 | if (!cgroup_is_dead(cgrp)) | 4624 | if (!ret) |
| 4101 | ret = cgroup_destroy_locked(cgrp); | 4625 | RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); |
| 4102 | |||
| 4103 | mutex_unlock(&cgroup_mutex); | ||
| 4104 | mutex_unlock(&cgroup_tree_mutex); | ||
| 4105 | 4626 | ||
| 4106 | kernfs_unbreak_active_protection(kn); | ||
| 4107 | cgroup_put(cgrp); | 4627 | cgroup_put(cgrp); |
| 4108 | return ret; | 4628 | return ret; |
| 4109 | } | 4629 | } |
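The RCU_INIT_POINTER() above is the retract half of a publish/retract pairing whose read halves sit in cgroupstats_build() and css_tryget_online_from_dir(). Generically, with my_obj hypothetical:

    /* teardown: remove the files first, then retract the backpointer */
    static void my_retract(struct kernfs_node *kn)
    {
            kernfs_remove(kn);      /* no new file ops can start */
            RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
    }

    /* lookup: only meaningful inside an RCU read section */
    static struct my_obj *my_lookup(struct kernfs_node *kn)
    {
            WARN_ON_ONCE(!rcu_read_lock_held());
            return rcu_dereference(kn->priv); /* may be NULL; tryget next */
    }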
| @@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { | |||
| 4116 | .rename = cgroup_rename, | 4636 | .rename = cgroup_rename, |
| 4117 | }; | 4637 | }; |
| 4118 | 4638 | ||
| 4119 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4639 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) |
| 4120 | { | 4640 | { |
| 4121 | struct cgroup_subsys_state *css; | 4641 | struct cgroup_subsys_state *css; |
| 4122 | 4642 | ||
| 4123 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4643 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
| 4124 | 4644 | ||
| 4125 | mutex_lock(&cgroup_tree_mutex); | ||
| 4126 | mutex_lock(&cgroup_mutex); | 4645 | mutex_lock(&cgroup_mutex); |
| 4127 | 4646 | ||
| 4647 | idr_init(&ss->css_idr); | ||
| 4128 | INIT_LIST_HEAD(&ss->cfts); | 4648 | INIT_LIST_HEAD(&ss->cfts); |
| 4129 | 4649 | ||
| 4130 | /* Create the root cgroup state for this subsystem */ | 4650 | /* Create the root cgroup state for this subsystem */ |
| @@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 4132 | css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); | 4652 | css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); |
| 4133 | /* We don't handle early failures gracefully */ | 4653 | /* We don't handle early failures gracefully */ |
| 4134 | BUG_ON(IS_ERR(css)); | 4654 | BUG_ON(IS_ERR(css)); |
| 4135 | init_css(css, ss, &cgrp_dfl_root.cgrp); | 4655 | init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); |
| 4656 | |||
| 4657 | /* | ||
| 4658 | * Root csses are never destroyed and we can't initialize | ||
| 4659 | * percpu_ref during early init. Disable refcnting. | ||
| 4660 | */ | ||
| 4661 | css->flags |= CSS_NO_REF; | ||
| 4662 | |||
| 4663 | if (early) { | ||
| 4664 | /* allocation can't be done safely during early init */ | ||
| 4665 | css->id = 1; | ||
| 4666 | } else { | ||
| 4667 | css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); | ||
| 4668 | BUG_ON(css->id < 0); | ||
| 4669 | } | ||
| 4136 | 4670 | ||
| 4137 | /* Update the init_css_set to contain a subsys | 4671 | /* Update the init_css_set to contain a subsys |
| 4138 | * pointer to this state - since the subsystem is | 4672 | * pointer to this state - since the subsystem is |
| @@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 4149 | 4683 | ||
| 4150 | BUG_ON(online_css(css)); | 4684 | BUG_ON(online_css(css)); |
| 4151 | 4685 | ||
| 4152 | cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; | ||
| 4153 | |||
| 4154 | mutex_unlock(&cgroup_mutex); | 4686 | mutex_unlock(&cgroup_mutex); |
| 4155 | mutex_unlock(&cgroup_tree_mutex); | ||
| 4156 | } | 4687 | } |
| 4157 | 4688 | ||
| 4158 | /** | 4689 | /** |
| @@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void) | |||
| 4169 | int i; | 4700 | int i; |
| 4170 | 4701 | ||
| 4171 | init_cgroup_root(&cgrp_dfl_root, &opts); | 4702 | init_cgroup_root(&cgrp_dfl_root, &opts); |
| 4703 | cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; | ||
| 4704 | |||
| 4172 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); | 4705 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
| 4173 | 4706 | ||
| 4174 | for_each_subsys(ss, i) { | 4707 | for_each_subsys(ss, i) { |
| @@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void) | |||
| 4183 | ss->name = cgroup_subsys_name[i]; | 4716 | ss->name = cgroup_subsys_name[i]; |
| 4184 | 4717 | ||
| 4185 | if (ss->early_init) | 4718 | if (ss->early_init) |
| 4186 | cgroup_init_subsys(ss); | 4719 | cgroup_init_subsys(ss, true); |
| 4187 | } | 4720 | } |
| 4188 | return 0; | 4721 | return 0; |
| 4189 | } | 4722 | } |
| @@ -4202,7 +4735,6 @@ int __init cgroup_init(void) | |||
| 4202 | 4735 | ||
| 4203 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); | 4736 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
| 4204 | 4737 | ||
| 4205 | mutex_lock(&cgroup_tree_mutex); | ||
| 4206 | mutex_lock(&cgroup_mutex); | 4738 | mutex_lock(&cgroup_mutex); |
| 4207 | 4739 | ||
| 4208 | /* Add init_css_set to the hash table */ | 4740 | /* Add init_css_set to the hash table */ |
| @@ -4212,18 +4744,31 @@ int __init cgroup_init(void) | |||
| 4212 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); | 4744 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); |
| 4213 | 4745 | ||
| 4214 | mutex_unlock(&cgroup_mutex); | 4746 | mutex_unlock(&cgroup_mutex); |
| 4215 | mutex_unlock(&cgroup_tree_mutex); | ||
| 4216 | 4747 | ||
| 4217 | for_each_subsys(ss, ssid) { | 4748 | for_each_subsys(ss, ssid) { |
| 4218 | if (!ss->early_init) | 4749 | if (ss->early_init) { |
| 4219 | cgroup_init_subsys(ss); | 4750 | struct cgroup_subsys_state *css = |
| 4751 | init_css_set.subsys[ss->id]; | ||
| 4752 | |||
| 4753 | css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, | ||
| 4754 | GFP_KERNEL); | ||
| 4755 | BUG_ON(css->id < 0); | ||
| 4756 | } else { | ||
| 4757 | cgroup_init_subsys(ss, false); | ||
| 4758 | } | ||
| 4759 | |||
| 4760 | list_add_tail(&init_css_set.e_cset_node[ssid], | ||
| 4761 | &cgrp_dfl_root.cgrp.e_csets[ssid]); | ||
| 4220 | 4762 | ||
| 4221 | /* | 4763 | /* |
| 4222 | * cftype registration needs kmalloc and can't be done | 4764 | * Setting dfl_root subsys_mask needs to consider the |
| 4223 | * during early_init. Register base cftypes separately. | 4765 | * disabled flag and cftype registration needs kmalloc, |
| 4766 | * both of which aren't available during early_init. | ||
| 4224 | */ | 4767 | */ |
| 4225 | if (ss->base_cftypes) | 4768 | if (!ss->disabled) { |
| 4769 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | ||
| 4226 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); | 4770 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); |
| 4771 | } | ||
| 4227 | } | 4772 | } |
| 4228 | 4773 | ||
| 4229 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4774 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
| @@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 4306 | 4851 | ||
| 4307 | seq_printf(m, "%d:", root->hierarchy_id); | 4852 | seq_printf(m, "%d:", root->hierarchy_id); |
| 4308 | for_each_subsys(ss, ssid) | 4853 | for_each_subsys(ss, ssid) |
| 4309 | if (root->cgrp.subsys_mask & (1 << ssid)) | 4854 | if (root->subsys_mask & (1 << ssid)) |
| 4310 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4855 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
| 4311 | if (strlen(root->name)) | 4856 | if (strlen(root->name)) |
| 4312 | seq_printf(m, "%sname=%s", count ? "," : "", | 4857 | seq_printf(m, "%sname=%s", count ? "," : "", |
| @@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 4501 | 5046 | ||
| 4502 | static void check_for_release(struct cgroup *cgrp) | 5047 | static void check_for_release(struct cgroup *cgrp) |
| 4503 | { | 5048 | { |
| 4504 | if (cgroup_is_releasable(cgrp) && | 5049 | if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && |
| 4505 | list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { | 5050 | !css_has_online_children(&cgrp->self)) { |
| 4506 | /* | 5051 | /* |
| 4507 | * Control Group is currently removeable. If it's not | 5052 | * Control Group is currently removeable. If it's not |
| 4508 | * already queued for a userspace notification, queue | 5053 | * already queued for a userspace notification, queue |
| @@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str) | |||
| 4619 | __setup("cgroup_disable=", cgroup_disable); | 5164 | __setup("cgroup_disable=", cgroup_disable); |
| 4620 | 5165 | ||
| 4621 | /** | 5166 | /** |
| 4622 | * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir | 5167 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
| 4623 | * @dentry: directory dentry of interest | 5168 | * @dentry: directory dentry of interest |
| 4624 | * @ss: subsystem of interest | 5169 | * @ss: subsystem of interest |
| 4625 | * | 5170 | * |
| @@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable); | |||
| 4627 | * to get the corresponding css and return it. If such css doesn't exist | 5172 | * to get the corresponding css and return it. If such css doesn't exist |
| 4628 | * or can't be pinned, an ERR_PTR value is returned. | 5173 | * or can't be pinned, an ERR_PTR value is returned. |
| 4629 | */ | 5174 | */ |
| 4630 | struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | 5175 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, |
| 4631 | struct cgroup_subsys *ss) | 5176 | struct cgroup_subsys *ss) |
| 4632 | { | 5177 | { |
| 4633 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); | 5178 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); |
| 4634 | struct cgroup_subsys_state *css = NULL; | 5179 | struct cgroup_subsys_state *css = NULL; |
| @@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | |||
| 4644 | /* | 5189 | /* |
| 4645 | * This path doesn't originate from kernfs and @kn could already | 5190 | * This path doesn't originate from kernfs and @kn could already |
| 4646 | * have been or be removed at any point. @kn->priv is RCU | 5191 | * have been or be removed at any point. @kn->priv is RCU |
| 4647 | * protected for this access. See destroy_locked() for details. | 5192 | * protected for this access. See cgroup_rmdir() for details. |
| 4648 | */ | 5193 | */ |
| 4649 | cgrp = rcu_dereference(kn->priv); | 5194 | cgrp = rcu_dereference(kn->priv); |
| 4650 | if (cgrp) | 5195 | if (cgrp) |
| 4651 | css = cgroup_css(cgrp, ss); | 5196 | css = cgroup_css(cgrp, ss); |
| 4652 | 5197 | ||
| 4653 | if (!css || !css_tryget(css)) | 5198 | if (!css || !css_tryget_online(css)) |
| 4654 | css = ERR_PTR(-ENOENT); | 5199 | css = ERR_PTR(-ENOENT); |
| 4655 | 5200 | ||
| 4656 | rcu_read_unlock(); | 5201 | rcu_read_unlock(); |
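A typical consumer of the renamed helper pins the css, uses it, and drops it; a sketch (the choice of memory_cgrp_subsys and the wrapper name are illustrative):

    /* Sketch: look up and pin the memory css behind a cgroup dentry. */
    static int use_css_from_dir(struct dentry *dentry)
    {
            struct cgroup_subsys_state *css;

            css = css_tryget_online_from_dir(dentry, &memory_cgrp_subsys);
            if (IS_ERR(css))
                    return PTR_ERR(css);

            /* ... @css is pinned and was online at lookup time ... */

            css_put(css);
            return 0;
    }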
| @@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | |||
| 4667 | */ | 5212 | */ |
| 4668 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 5213 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
| 4669 | { | 5214 | { |
| 4670 | struct cgroup *cgrp; | 5215 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 4671 | 5216 | return idr_find(&ss->css_idr, id); | |
| 4672 | cgroup_assert_mutexes_or_rcu_locked(); | ||
| 4673 | |||
| 4674 | cgrp = idr_find(&ss->root->cgroup_idr, id); | ||
| 4675 | if (cgrp) | ||
| 4676 | return cgroup_css(cgrp, ss); | ||
| 4677 | return NULL; | ||
| 4678 | } | 5217 | } |
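With css IDs now living in ss->css_idr, resolution needs only the RCU read lock plus a tryget to keep the result; a hedged usage sketch with a hypothetical wrapper:

    /* Sketch: resolve a css ID and pin the css iff it is still online. */
    static struct cgroup_subsys_state *pin_css_by_id(int id,
                                                     struct cgroup_subsys *ss)
    {
            struct cgroup_subsys_state *css;

            rcu_read_lock();
            css = css_from_id(id, ss);
            if (css && !css_tryget_online(css))
                    css = NULL;
            rcu_read_unlock();
            return css;
    }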
| 4679 | 5218 | ||
| 4680 | #ifdef CONFIG_CGROUP_DEBUG | 5219 | #ifdef CONFIG_CGROUP_DEBUG |
