aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorPaul Menage <menage@google.com>2007-10-19 02:39:30 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-19 14:53:36 -0400
commitddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (patch)
tree0881a031e669582f819d572339e955b04abfc3d2 /include
parent55a230aae650157720becc09cadb7d10efbf5013 (diff)
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------

There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy cgroups, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways.

This patchset provides a framework for tracking and grouping processes into arbitrary "cgroups" and assigning arbitrary state to those groupings, in order to control the behaviour of the cgroup as an aggregate.

The intention is that the various resource management and virtualization/cgroup efforts can also become task cgroup clients, with the result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel

This patch:

Add the main task cgroups framework - the cgroup filesystem, and the basic structures for tracking membership and associating subsystem state objects to tasks.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/cgroup.h214
-rw-r--r--include/linux/cgroup_subsys.h10
-rw-r--r--include/linux/magic.h1
-rw-r--r--include/linux/sched.h34
4 files changed, 258 insertions, 1 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
new file mode 100644
index 000000000000..60735dcf427a
--- /dev/null
+++ b/include/linux/cgroup.h
@@ -0,0 +1,214 @@
1#ifndef _LINUX_CGROUP_H
2#define _LINUX_CGROUP_H
3/*
4 * cgroup interface
5 *
6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 *
9 */
10
11#include <linux/sched.h>
12#include <linux/kref.h>
13#include <linux/cpumask.h>
14#include <linux/nodemask.h>
15#include <linux/rcupdate.h>
16
17#ifdef CONFIG_CGROUPS
18
19struct cgroupfs_root;
20struct cgroup_subsys;
21struct inode;
22
/*
 * Entry points provided by the cgroup core.  Only the prototypes live in
 * this header; the definitions are elsewhere.
 */
extern int cgroup_init_early(void);
extern int cgroup_init(void);
extern void cgroup_init_smp(void);

/* NOTE(review): presumably take/release the core cgroup lock - confirm. */
extern void cgroup_lock(void);
extern void cgroup_unlock(void);
28
29/* Per-subsystem/per-cgroup state maintained by the system. */
30struct cgroup_subsys_state {
31 /* The cgroup that this subsystem is attached to. Useful
32 * for subsystems that want to know about the cgroup
33 * hierarchy structure */
34 struct cgroup *cgroup;
35
36 /* State maintained by the cgroup system to allow
37 * subsystems to be "busy". Should be accessed via css_get()
38 * and css_put() */
39
40 atomic_t refcnt;
41
42 unsigned long flags;
43};
44
/* Bit numbers used in the flags field of struct cgroup_subsys_state. */
enum {
	CSS_ROOT,	/* This CSS is the root of the subsystem */
};
49
50/*
51 * Call css_get() to hold a reference on the cgroup;
52 *
53 */
54
55static inline void css_get(struct cgroup_subsys_state *css)
56{
57 /* We don't need to reference count the root state */
58 if (!test_bit(CSS_ROOT, &css->flags))
59 atomic_inc(&css->refcnt);
60}
61/*
62 * css_put() should be called to release a reference taken by
63 * css_get()
64 */
65
66static inline void css_put(struct cgroup_subsys_state *css)
67{
68 if (!test_bit(CSS_ROOT, &css->flags))
69 atomic_dec(&css->refcnt);
70}
71
72struct cgroup {
73 unsigned long flags; /* "unsigned long" so bitops work */
74
75 /* count users of this cgroup. >0 means busy, but doesn't
76 * necessarily indicate the number of tasks in the
77 * cgroup */
78 atomic_t count;
79
80 /*
81 * We link our 'sibling' struct into our parent's 'children'.
82 * Our children link their 'sibling' into our 'children'.
83 */
84 struct list_head sibling; /* my parent's children */
85 struct list_head children; /* my children */
86
87 struct cgroup *parent; /* my parent */
88 struct dentry *dentry; /* cgroup fs entry */
89
90 /* Private pointers for each registered subsystem */
91 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
92
93 struct cgroupfs_root *root;
94 struct cgroup *top_cgroup;
95};
96
97/* struct cftype:
98 *
99 * The files in the cgroup filesystem mostly have a very simple read/write
100 * handling, some common function will take care of it. Nevertheless some cases
101 * (read tasks) are special and therefore I define this structure for every
102 * kind of file.
103 *
104 *
105 * When reading/writing to a file:
106 * - the cgroup to use in file->f_dentry->d_parent->d_fsdata
107 * - the 'cftype' of the file is file->f_dentry->d_fsdata
108 */
109
110#define MAX_CFTYPE_NAME 64
111struct cftype {
112 /* By convention, the name should begin with the name of the
113 * subsystem, followed by a period */
114 char name[MAX_CFTYPE_NAME];
115 int private;
116 int (*open) (struct inode *inode, struct file *file);
117 ssize_t (*read) (struct cgroup *cont, struct cftype *cft,
118 struct file *file,
119 char __user *buf, size_t nbytes, loff_t *ppos);
120 /*
121 * read_uint() is a shortcut for the common case of returning a
122 * single integer. Use it in place of read()
123 */
124 u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
125 ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
126 struct file *file,
127 const char __user *buf, size_t nbytes, loff_t *ppos);
128 int (*release) (struct inode *inode, struct file *file);
129};
130
131/* Add a new file to the given cgroup directory. Should only be
132 * called by subsystems from within a populate() method */
133int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
134 const struct cftype *cft);
135
136/* Add a set of new files to the given cgroup directory. Should
137 * only be called by subsystems from within a populate() method */
138int cgroup_add_files(struct cgroup *cont,
139 struct cgroup_subsys *subsys,
140 const struct cftype cft[],
141 int count);
142
143int cgroup_is_removed(const struct cgroup *cont);
144
145int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
146
147/* Return true if the cgroup is a descendant of the current cgroup */
148int cgroup_is_descendant(const struct cgroup *cont);
149
150/* Control Group subsystem type. See Documentation/cgroups.txt for details */
151
152struct cgroup_subsys {
153 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
154 struct cgroup *cont);
155 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cont);
156 int (*can_attach)(struct cgroup_subsys *ss,
157 struct cgroup *cont, struct task_struct *tsk);
158 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cont,
159 struct cgroup *old_cont, struct task_struct *tsk);
160 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
161 void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
162 int (*populate)(struct cgroup_subsys *ss,
163 struct cgroup *cont);
164 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
165 int subsys_id;
166 int active;
167 int early_init;
168#define MAX_CGROUP_TYPE_NAMELEN 32
169 const char *name;
170
171 /* Protected by RCU */
172 struct cgroupfs_root *root;
173
174 struct list_head sibling;
175
176 void *private;
177};
178
179#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
180#include <linux/cgroup_subsys.h>
181#undef SUBSYS
182
183static inline struct cgroup_subsys_state *cgroup_subsys_state(
184 struct cgroup *cont, int subsys_id)
185{
186 return cont->subsys[subsys_id];
187}
188
189static inline struct cgroup_subsys_state *task_subsys_state(
190 struct task_struct *task, int subsys_id)
191{
192 return rcu_dereference(task->cgroups.subsys[subsys_id]);
193}
194
195static inline struct cgroup* task_cgroup(struct task_struct *task,
196 int subsys_id)
197{
198 return task_subsys_state(task, subsys_id)->cgroup;
199}
200
/* cgroup_path() is already declared earlier in this header. */
202
203#else /* !CONFIG_CGROUPS */
204
/*
 * Stand-ins for the cgroup core entry points in the !CONFIG_CGROUPS build:
 * the init functions report 0 and the rest do nothing.
 */
static inline int cgroup_init_early(void)
{
	return 0;
}

static inline int cgroup_init(void)
{
	return 0;
}

static inline void cgroup_init_smp(void)
{
}

static inline void cgroup_lock(void)
{
}

static inline void cgroup_unlock(void)
{
}
211
212#endif /* !CONFIG_CGROUPS */
213
214#endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
new file mode 100644
index 000000000000..f8eddbbcad9a
--- /dev/null
+++ b/include/linux/cgroup_subsys.h
@@ -0,0 +1,10 @@
1/* Add subsystem definitions of the form SUBSYS(<name>) in this
2 * file. Surround each one by a line of comment markers so that
3 * patches don't collide
4 */
5
6/* */
7
8/* */
9
10/* */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 722d4755060f..1fa0c2ce4dec 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -37,6 +37,7 @@
37 37
38#define SMB_SUPER_MAGIC 0x517B 38#define SMB_SUPER_MAGIC 0x517B
39#define USBDEVICE_SUPER_MAGIC 0x9fa2 39#define USBDEVICE_SUPER_MAGIC 0x9fa2
40#define CGROUP_SUPER_MAGIC 0x27e0eb
40 41
41#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA 42#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
42#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA 43#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 10a83d8d5775..af2ed4bae678 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -894,6 +894,34 @@ struct sched_entity {
894#endif 894#endif
895}; 895};
896 896
897#ifdef CONFIG_CGROUPS
898
899#define SUBSYS(_x) _x ## _subsys_id,
900enum cgroup_subsys_id {
901#include <linux/cgroup_subsys.h>
902 CGROUP_SUBSYS_COUNT
903};
904#undef SUBSYS
905
906/* A css_set is a structure holding pointers to a set of
907 * cgroup_subsys_state objects.
908 */
909
910struct css_set {
911
912 /* Set of subsystem states, one for each subsystem. NULL for
913 * subsystems that aren't part of this hierarchy. These
914 * pointers reduce the number of dereferences required to get
915 * from a task to its state for a given cgroup, but result
916 * in increased space usage if tasks are in wildly different
917 * groupings across different hierarchies. This array is
918 * immutable after creation */
919 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
920
921};
922
923#endif /* CONFIG_CGROUPS */
924
897struct task_struct { 925struct task_struct {
898 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 926 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
899 void *stack; 927 void *stack;
@@ -1130,6 +1158,9 @@ struct task_struct {
1130 int cpuset_mems_generation; 1158 int cpuset_mems_generation;
1131 int cpuset_mem_spread_rotor; 1159 int cpuset_mem_spread_rotor;
1132#endif 1160#endif
1161#ifdef CONFIG_CGROUPS
1162 struct css_set cgroups;
1163#endif
1133#ifdef CONFIG_FUTEX 1164#ifdef CONFIG_FUTEX
1134 struct robust_list_head __user *robust_list; 1165 struct robust_list_head __user *robust_list;
1135#ifdef CONFIG_COMPAT 1166#ifdef CONFIG_COMPAT
@@ -1625,7 +1656,8 @@ static inline int thread_group_empty(struct task_struct *p)
1625/* 1656/*
1626 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 1657 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
1627 * subscriptions and synchronises with wait4(). Also used in procfs. Also 1658 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1628 * pins the final release of task.io_context. Also protects ->cpuset. 1659 * pins the final release of task.io_context. Also protects ->cpuset and
1660 * ->cgroups.subsys[].
1629 * 1661 *
1630 * Nests both inside and outside of read_lock(&tasklist_lock). 1662 * Nests both inside and outside of read_lock(&tasklist_lock).
1631 * It must not be nested with write_lock_irq(&tasklist_lock), 1663 * It must not be nested with write_lock_irq(&tasklist_lock),