Diffstat (limited to 'include/linux/cgroup.h')
 include/linux/cgroup.h | 229 ++++++++++++++++++++++++++-----------------
 1 file changed, 139 insertions(+), 90 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8bda1294c035..297462b9f41a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -94,56 +90,52 @@ enum {
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
-/* Caller must verify that the css is not for root cgroup */
-static inline void __css_get(struct cgroup_subsys_state *css, int count)
-{
-	atomic_add(count, &css->refcnt);
-}
-
-/*
- * Call css_get() to hold a reference on the css; it can be used
- * for a reference obtained via:
- * - an existing ref-counted reference to the css
- * - task->cgroups for a locked task
+/**
+ * css_get - obtain a reference on the specified css
+ * @css: target css
+ *
+ * The caller must already have a reference.
  */
-
 static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		__css_get(css, 1);
+		percpu_ref_get(&css->refcnt);
 }
 
-/*
- * Call css_tryget() to take a reference on a css if your existing
- * (known-valid) reference isn't already ref-counted.  Returns false if
- * the css has been destroyed.
+/**
+ * css_tryget - try to obtain a reference on the specified css
+ * @css: target css
+ *
+ * Obtain a reference on @css if it's alive.  The caller naturally needs to
+ * ensure that @css is accessible but doesn't have to be holding a
+ * reference on it - IOW, RCU protected access is good enough for this
+ * function.  Returns %true if a reference count was successfully obtained;
+ * %false otherwise.
  */
-
-extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
-/*
- * css_put() should be called to release a reference taken by
- * css_get() or css_tryget()
+/**
+ * css_put - put a css reference
+ * @css: target css
+ *
+ * Put a reference obtained via css_get() and css_tryget().
  */
-
-extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
 
 /* bits in struct cgroup flags field */
 enum {
 	/* Control Group is dead */
-	CGRP_REMOVED,
+	CGRP_DEAD,
 	/*
 	 * Control Group has previously had a child cgroup or a task,
 	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
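
Usage sketch for the conversion above (not part of the patch; my_get_live_css()
is a hypothetical caller): the caller-side contract is unchanged - RCU
protection is still enough to attempt css_tryget(), which now fails through
percpu_ref_tryget() once the reference has been killed.

	static struct cgroup_subsys_state *
	my_get_live_css(struct cgroup *cgrp, int subsys_id)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		css = cgroup_subsys_state(cgrp, subsys_id);
		/* fails once the css's percpu_ref has been killed */
		if (css && !css_tryget(css))
			css = NULL;
		rcu_read_unlock();
		return css;	/* caller releases with css_put() */
	}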
@@ -169,12 +161,6 @@ struct cgroup_name {
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
-	/*
-	 * count users of this cgroup. >0 means busy, but doesn't
-	 * necessarily indicate the number of tasks in the cgroup
-	 */
-	atomic_t count;
-
 	int id;				/* ida allocated in-hierarchy ID */
 
 	/*
@@ -189,6 +175,14 @@ struct cgroup {
 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
 
 	/*
+	 * Monotonically increasing unique serial number which defines a
+	 * uniform order among all cgroups.  It's guaranteed that all
+	 * ->children lists are in the ascending order of ->serial_nr.
+	 * It's used to allow interrupting and resuming iterations.
+	 */
+	u64 serial_nr;
+
+	/*
 	 * This is a copy of dentry->d_name, and it's needed because
 	 * we can't use dentry->d_name in cgroup_path().
 	 *
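
A sketch of what the ->serial_nr ordering guarantee buys, inferred from the
comment above (resume_after() is illustrative, not code from this patch): if
the cgroup an iterator stopped at has since died, the walk can resume at the
first sibling with a greater serial number.

	/* caller must hold rcu_read_lock() */
	static struct cgroup *resume_after(struct cgroup *parent, u64 serial_nr)
	{
		struct cgroup *child;

		/* ->children is kept sorted by ascending ->serial_nr */
		list_for_each_entry_rcu(child, &parent->children, sibling)
			if (child->serial_nr > serial_nr)
				return child;
		return NULL;
	}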
@@ -207,13 +201,10 @@ struct cgroup {
 	struct cgroupfs_root *root;
 
 	/*
-	 * List of cg_cgroup_links pointing at css_sets with
-	 * tasks in this cgroup. Protected by css_set_lock
+	 * List of cgrp_cset_links pointing at css_sets with tasks in this
+	 * cgroup.  Protected by css_set_lock.
 	 */
-	struct list_head css_sets;
-
-	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
-	struct list_head cft_q_node;	/* used during cftype add/rm */
+	struct list_head cset_links;
 
 	/*
 	 * Linked list running through all cgroups that can
@@ -229,9 +220,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
-	struct work_struct free_work;
+	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
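
A loose sketch of how these fields appear intended to interact - an assumption
drawn from the comment; the real destroy path lives in kernel/cgroup.c and is
not shown by this diff. Each css's percpu_ref is killed, each kill
confirmation drops css_kill_cnt, and the last one schedules destroy_work:

	/* hypothetical confirmation callback used when killing a css ref */
	static void my_css_killed(struct percpu_ref *ref)
	{
		struct cgroup_subsys_state *css =
			container_of(ref, struct cgroup_subsys_state, refcnt);
		struct cgroup *cgrp = css->cgroup;

		if (atomic_dec_and_test(&cgrp->css_kill_cnt))
			schedule_work(&cgrp->destroy_work);
	}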
@@ -269,18 +261,35 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
+	 * - rename(2) is disallowed.
+	 *
+	 * - "tasks" is removed.  Everything should be at process
+	 *   granularity.  Use "cgroup.procs" instead.
+	 *
+	 * - "release_agent" and "notify_on_release" are removed.
+	 *   Replacement notification mechanism will be implemented.
+	 *
+	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
+	 *   and take masks of ancestors with non-empty cpus/mems, instead of
+	 *   being moved to an ancestor.
+	 *
+	 * - cpuset: a task can be moved into an empty cpuset, and again it
+	 *   takes masks of ancestors.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
-	 * The followings are planned changes.
-	 *
-	 * - release_agent will be disallowed once replacement notification
-	 *   mechanism is implemented.
+	 * - blkcg: blk-throttle becomes properly hierarchical.
 	 */
 	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0),
 
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+
+	/* mount options live below bit 16 */
+	CGRP_ROOT_OPTION_MASK	= (1 << 16) - 1,
+
+	CGRP_ROOT_SUBSYS_BOUND	= (1 << 16), /* subsystems finished binding */
 };
 
 /*
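
With mount options confined below bit 16, remount validation can compare just
the option bits; a one-line sketch (opts_changed() is hypothetical):

	static bool opts_changed(struct cgroupfs_root *root, unsigned long new_flags)
	{
		return (root->flags ^ new_flags) & CGRP_ROOT_OPTION_MASK;
	}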
@@ -291,18 +300,12 @@ enum {
 struct cgroupfs_root {
 	struct super_block *sb;
 
-	/*
-	 * The bitmask of subsystems intended to be attached to this
-	 * hierarchy
-	 */
+	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned long subsys_mask;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_mask;
-
 	/* A list running through the attached subsystems */
 	struct list_head subsys_list;
 
@@ -315,9 +318,6 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
-	/* All cgroups on this root, cgroup_mutex protected */
-	struct list_head allcg_list;
-
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
@@ -357,11 +357,10 @@ struct css_set {
 	struct list_head tasks;
 
 	/*
-	 * List of cg_cgroup_link objects on link chains from
-	 * cgroups referenced from this css_set. Protected by
-	 * css_set_lock
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct list_head cg_links;
+	struct list_head cgrp_links;
 
 	/*
 	 * Set of subsystem states, one for each subsystem. This array
@@ -394,9 +393,11 @@ struct cgroup_map_cb {
  */
 
 /* cftype->flags */
-#define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
-#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
-#define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
+enum {
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cg */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cg */
+	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
+};
 
 #define MAX_CFTYPE_NAME		64
 
@@ -442,13 +443,13 @@ struct cftype {
 	 * entry. The key/value pairs (and their ordering) should not
 	 * change between reboots.
 	 */
-	int (*read_map)(struct cgroup *cont, struct cftype *cft,
+	int (*read_map)(struct cgroup *cgrp, struct cftype *cft,
 			struct cgroup_map_cb *cb);
 	/*
 	 * read_seq_string() is used for outputting a simple sequence
 	 * using seqfile.
 	 */
-	int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
+	int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *m);
 
 	ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
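
Putting the renamed parameters and the new flag enum together, a hypothetical
subsystem file table might look like this (the file name, handler, and value
are made up for illustration; registration goes through cgroup_add_cftypes()):

	static int my_weight_show(struct cgroup *cgrp, struct cftype *cft,
				  struct seq_file *m)
	{
		seq_printf(m, "%d\n", 100);	/* placeholder value */
		return 0;
	}

	static struct cftype my_files[] = {
		{
			.name = "example.weight",
			.flags = CFTYPE_NOT_ON_ROOT,	/* skip the root cgroup */
			.read_seq_string = my_weight_show,
		},
		{ }	/* terminator */
	};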
@@ -538,10 +539,11 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
-int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
+int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
+				    char *buf, size_t buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
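
Usage sketch for the new declaration, assuming it follows cgroup_path()'s
return-0-on-success convention (that convention is not visible in this
header, so treat it as an assumption):

	static void my_print_task_path(struct task_struct *task)
	{
		char buf[256];	/* toy-sized buffer for the sketch */

		if (!task_cgroup_path_from_hierarchy(task, 1, buf, sizeof(buf)))
			pr_info("%s\n", buf);	/* path on hierarchy 1 */
	}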
@@ -646,22 +648,60 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
 	return cgrp->subsys[subsys_id];
 }
 
-/*
- * function to get the cgroup_subsys_state which allows for extra
- * rcu_dereference_check() conditions, such as locks used during the
- * cgroup_subsys::attach() methods.
+/**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * A task's css_set is RCU protected, initialized and exited while holding
+ * task_lock(), and can only be modified while holding both cgroup_mutex
+ * and task_lock() while the task is alive.  This macro verifies that the
+ * caller is inside proper critical section and returns @task's css_set.
+ *
+ * The caller can also specify additional allowed conditions via @__c, such
+ * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
-#define task_subsys_state_check(task, subsys_id, __c)		\
-	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
+#define task_css_set_check(task, __c)				\
+	rcu_dereference_check((task)->cgroups,			\
 			      lockdep_is_held(&(task)->alloc_lock) ||	\
 			      lockdep_is_held(&cgroup_mutex) || (__c))
 #else
-#define task_subsys_state_check(task, subsys_id, __c)		\
-	rcu_dereference((task)->cgroups->subsys[(subsys_id)])
+#define task_css_set_check(task, __c)				\
+	rcu_dereference((task)->cgroups)
 #endif
 
+/**
+ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
+ * synchronization rules are the same as task_css_set_check().
+ */
+#define task_subsys_state_check(task, subsys_id, __c)		\
+	task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
+/**
+ * task_css_set - obtain a task's css_set
+ * @task: the task to obtain css_set for
+ *
+ * See task_css_set_check().
+ */
+static inline struct css_set *task_css_set(struct task_struct *task)
+{
+	return task_css_set_check(task, false);
+}
+
+/**
+ * task_subsys_state - obtain css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * See task_subsys_state_check().
+ */
 static inline struct cgroup_subsys_state *
 task_subsys_state(struct task_struct *task, int subsys_id)
 {
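
The split into task_css_set_check() plus thin wrappers keeps the lockdep
conditions in one place. A hypothetical reader under RCU (the function itself
is illustrative, not part of the patch):

	static bool my_task_in_root_cgroup(struct task_struct *task, int subsys_id)
	{
		bool ret;

		rcu_read_lock();
		ret = task_subsys_state(task, subsys_id)->flags & CSS_ROOT;
		rcu_read_unlock();
		return ret;
	}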
@@ -674,12 +714,14 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
 }
 
+struct cgroup *cgroup_next_sibling(struct cgroup *pos);
+
 /**
  * cgroup_for_each_child - iterate through children of a cgroup
  * @pos: the cgroup * to use as the loop cursor
- * @cgroup: cgroup whose children to walk
+ * @cgrp: cgroup whose children to walk
  *
- * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
+ * Walk @cgrp's children.  Must be called under rcu_read_lock().  A child
  * cgroup which hasn't finished ->css_online() or already has finished
  * ->css_offline() may show up during traversal and it's each subsystem's
  * responsibility to verify that each @pos is alive.
@@ -687,9 +729,15 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
  * If a subsystem synchronizes against the parent in its ->css_online() and
  * before starting iterating, a cgroup which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
-#define cgroup_for_each_child(pos, cgroup)			\
-	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+#define cgroup_for_each_child(pos, cgrp)			\
+	for ((pos) = list_first_or_null_rcu(&(cgrp)->children,	\
+					    struct cgroup, sibling);	\
+	     (pos); (pos) = cgroup_next_sibling((pos)))
 
 struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 					  struct cgroup *cgroup);
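
A sketch of the newly-allowed pattern described in the comment above
(my_process_child() is a hypothetical, possibly-sleeping callback): pin @pos
with a css reference, drop the RCU read lock for the slow work, then let
cgroup_next_sibling() continue from the still-pinned position.

	static void my_walk_children(struct cgroup *cgrp, int subsys_id)
	{
		struct cgroup *pos;

		rcu_read_lock();
		cgroup_for_each_child(pos, cgrp) {
			struct cgroup_subsys_state *css;

			css = cgroup_subsys_state(pos, subsys_id);
			if (!css_tryget(css))
				continue;	/* child is dying, skip it */
			rcu_read_unlock();

			my_process_child(pos);	/* may sleep */

			rcu_read_lock();
			/* @pos stays accessible until the next iteration starts */
			css_put(css);
		}
		rcu_read_unlock();
	}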
@@ -748,6 +796,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  * Alternatively, a subsystem may choose to use a single global lock to
  * synchronize ->css_online() and ->css_offline() against tree-walking
  * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
 #define cgroup_for_each_descendant_pre(pos, cgroup)		\
 	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
@@ -771,7 +823,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 
 /* A cgroup_iter should be treated as an opaque object */
 struct cgroup_iter {
-	struct list_head *cg_link;
+	struct list_head *cset_link;
 	struct list_head *task;
 };
 
@@ -827,7 +879,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
@@ -838,8 +889,6 @@ static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
-static inline void cgroup_lock(void) {}
-static inline void cgroup_unlock(void) {}
 static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry)
 {