aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/memory.txt17
-rw-r--r--Documentation/cgroups/resource_counter.txt197
-rw-r--r--include/linux/res_counter.h223
-rw-r--r--init/Kconfig6
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/res_counter.c211
6 files changed, 8 insertions, 647 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f624727ab404..67613ff0270c 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -116,16 +116,16 @@ The memory controller is the first controller developed.
116 116
1172.1. Design 1172.1. Design
118 118
119The core of the design is a counter called the res_counter. The res_counter 119The core of the design is a counter called the page_counter. The
120tracks the current memory usage and limit of the group of processes associated 120page_counter tracks the current memory usage and limit of the group of
121with the controller. Each cgroup has a memory controller specific data 121processes associated with the controller. Each cgroup has a memory controller
122structure (mem_cgroup) associated with it. 122specific data structure (mem_cgroup) associated with it.
123 123
1242.2. Accounting 1242.2. Accounting
125 125
126 +--------------------+ 126 +--------------------+
127 | mem_cgroup | 127 | mem_cgroup |
128 | (res_counter) | 128 | (page_counter) |
129 +--------------------+ 129 +--------------------+
130 / ^ \ 130 / ^ \
131 / | \ 131 / | \
@@ -352,9 +352,8 @@ set:
3520. Configuration 3520. Configuration
353 353
354a. Enable CONFIG_CGROUPS 354a. Enable CONFIG_CGROUPS
355b. Enable CONFIG_RESOURCE_COUNTERS 355b. Enable CONFIG_MEMCG
356c. Enable CONFIG_MEMCG 356c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
357d. Enable CONFIG_MEMCG_SWAP (to use swap extension)
358d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 357d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
359 358
3601. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 3591. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
deleted file mode 100644
index 762ca54eb929..000000000000
--- a/Documentation/cgroups/resource_counter.txt
+++ /dev/null
@@ -1,197 +0,0 @@
1
2 The Resource Counter
3
4The resource counter, declared at include/linux/res_counter.h,
5is supposed to facilitate the resource management by controllers
6by providing common stuff for accounting.
7
8This "stuff" includes the res_counter structure and routines
9to work with it.
10
11
12
131. Crucial parts of the res_counter structure
14
15 a. unsigned long long usage
16
17 The usage value shows the amount of a resource that is consumed
18 by a group at a given time. The units of measurement should be
19 determined by the controller that uses this counter. E.g. it can
20 be bytes, items or any other unit the controller operates on.
21
22 b. unsigned long long max_usage
23
24 The maximal value of the usage over time.
25
26 This value is useful when gathering statistical information about
27 the particular group, as it shows the actual resource requirements
28 for a particular group, not just some usage snapshot.
29
30 c. unsigned long long limit
31
32 The maximal allowed amount of resource to consume by the group. In
33 case the group requests for more resources, so that the usage value
34 would exceed the limit, the resource allocation is rejected (see
35 the next section).
36
37 d. unsigned long long failcnt
38
39 The failcnt stands for "failures counter". This is the number of
40 resource allocation attempts that failed.
41
42 e. spinlock_t lock
43
44 Protects changes of the above values.
45
46
47
482. Basic accounting routines
49
50 a. void res_counter_init(struct res_counter *rc,
51 struct res_counter *rc_parent)
52
53 Initializes the resource counter. As usual, should be the first
54 routine called for a new counter.
55
56 The struct res_counter *rc_parent can be used to define a hierarchical
57 child -> parent relationship directly in the res_counter structure,
58 NULL can be used to define no relationship.
59
60 c. int res_counter_charge(struct res_counter *rc, unsigned long val,
61 struct res_counter **limit_fail_at)
62
63 When a resource is about to be allocated it has to be accounted
64 with the appropriate resource counter (controller should determine
65 which one to use on its own). This operation is called "charging".
66
67 It is not very important which operation - resource allocation
68 or charging - is performed first, but
69 * if the allocation is performed first, this may create a
70 temporary resource over-usage by the time resource counter is
71 charged;
72 * if the charging is performed first, then it should be uncharged
73 on error path (if the one is called).
74
75 If the charging fails and a hierarchical dependency exists, the
76 limit_fail_at parameter is set to the particular res_counter element
77 where the charging failed.
78
79 d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val)
80
81 When a resource is released (freed) it should be de-accounted
82 from the resource counter it was accounted to. This is called
83 "uncharging". The return value of this function indicates the amount
84 of charges still present in the counter.
85
86 The _locked routines imply that the res_counter->lock is taken.
87
88 e. u64 res_counter_uncharge_until
89 (struct res_counter *rc, struct res_counter *top,
90 unsigned long val)
91
92 Almost same as res_counter_uncharge() but propagation of uncharge
93 stops when rc == top. This is useful when killing a res_counter in
94 child cgroup.
95
96 2.1 Other accounting routines
97
98 There are more routines that may help you with common needs, like
99 checking whether the limit is reached or resetting the max_usage
100 value. They are all declared in include/linux/res_counter.h.
101
102
103
1043. Analyzing the resource counter registrations
105
106 a. If the failcnt value constantly grows, this means that the counter's
107 limit is too tight. Either the group is misbehaving and consumes too
108 many resources, or the configuration is not suitable for the group
109 and the limit should be increased.
110
111 b. The max_usage value can be used to quickly tune the group. One may
112 set the limits to maximal values and either load the container with
113 a common pattern or leave one for a while. After this the max_usage
114 value shows the amount of memory the container would require during
115 its common activity.
116
117 Setting the limit a bit above this value gives a pretty good
118 configuration that works in most of the cases.
119
120 c. If the max_usage is much less than the limit, but the failcnt value
121 is growing, then the group tries to allocate a big chunk of resource
122 at once.
123
124 d. If the max_usage is much less than the limit, but the failcnt value
125 is 0, then this group is given too high limit, that it does not
126 require. It is better to lower the limit a bit leaving more resource
127 for other groups.
128
129
130
1314. Communication with the control groups subsystem (cgroups)
132
133All the resource controllers that are using cgroups and resource counters
134should provide files (in the cgroup filesystem) to work with the resource
135counter fields. They are recommended to adhere to the following rules:
136
137 a. File names
138
139 Field name File name
140 ---------------------------------------------------
141 usage usage_in_<unit_of_measurement>
142 max_usage max_usage_in_<unit_of_measurement>
143 limit limit_in_<unit_of_measurement>
144 failcnt failcnt
145 lock no file :)
146
147 b. Reading from file should show the corresponding field value in the
148 appropriate format.
149
150 c. Writing to file
151
152 Field Expected behavior
153 ----------------------------------
154 usage prohibited
155 max_usage reset to usage
156 limit set the limit
157 failcnt reset to zero
158
159
160
1615. Usage example
162
163 a. Declare a task group (take a look at cgroups subsystem for this) and
164 fold a res_counter into it
165
166 struct my_group {
167 struct res_counter res;
168
169 <other fields>
170 }
171
172 b. Put hooks in resource allocation/release paths
173
174 int alloc_something(...)
175 {
176 if (res_counter_charge(res_counter_ptr, amount) < 0)
177 return -ENOMEM;
178
179 <allocate the resource and return to the caller>
180 }
181
182 void release_something(...)
183 {
184 res_counter_uncharge(res_counter_ptr, amount);
185
186 <release the resource>
187 }
188
189 In order to keep the usage value self-consistent, both the
190 "res_counter_ptr" and the "amount" in release_something() should be
191 the same as they were in the alloc_something() when the releasing
192 resource was allocated.
193
194 c. Provide the way to read res_counter values and set them (the cgroups
195 still can help with it).
196
197 d. Compile and run :)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
deleted file mode 100644
index 56b7bc32db4f..000000000000
--- a/include/linux/res_counter.h
+++ /dev/null
@@ -1,223 +0,0 @@
1#ifndef __RES_COUNTER_H__
2#define __RES_COUNTER_H__
3
4/*
5 * Resource Counters
6 * Contain common data types and routines for resource accounting
7 *
8 * Copyright 2007 OpenVZ SWsoft Inc
9 *
10 * Author: Pavel Emelianov <xemul@openvz.org>
11 *
12 * See Documentation/cgroups/resource_counter.txt for more
13 * info about what this counter is.
14 */
15
16#include <linux/spinlock.h>
17#include <linux/errno.h>
18
19/*
20 * The core object. A cgroup that wishes to account for some
21 * resource may embed this counter in its structures and use
22 * the helpers described below
23 */
24
25struct res_counter {
26 /*
27 * the current resource consumption level
28 */
29 unsigned long long usage;
30 /*
31 * the maximal value of the usage from the counter creation
32 */
33 unsigned long long max_usage;
34 /*
35 * the limit that usage cannot exceed
36 */
37 unsigned long long limit;
38 /*
39 * the limit that usage can be exceed
40 */
41 unsigned long long soft_limit;
42 /*
43 * the number of unsuccessful attempts to consume the resource
44 */
45 unsigned long long failcnt;
46 /*
47 * the lock to protect all of the above.
48 * the routines below consider this to be IRQ-safe
49 */
50 spinlock_t lock;
51 /*
52 * Parent counter, used for hierarchial resource accounting
53 */
54 struct res_counter *parent;
55};
56
57#define RES_COUNTER_MAX ULLONG_MAX
58
59/**
60 * Helpers to interact with userspace
61 * res_counter_read_u64() - returns the value of the specified member.
62 * res_counter_read/_write - put/get the specified fields from the
63 * res_counter struct to/from the user
64 *
65 * @counter: the counter in question
66 * @member: the field to work with (see RES_xxx below)
67 * @buf: the buffer to operate on,...
68 * @nbytes: its size...
69 * @pos: and the offset.
70 */
71
72u64 res_counter_read_u64(struct res_counter *counter, int member);
73
74ssize_t res_counter_read(struct res_counter *counter, int member,
75 const char __user *buf, size_t nbytes, loff_t *pos,
76 int (*read_strategy)(unsigned long long val, char *s));
77
78int res_counter_memparse_write_strategy(const char *buf,
79 unsigned long long *res);
80
/*
 * Field descriptors: each value selects one of the numeric members of
 * struct res_counter.
 */
enum {
	RES_USAGE	= 0,	/* usage */
	RES_MAX_USAGE	= 1,	/* max_usage */
	RES_LIMIT	= 2,	/* limit */
	RES_FAILCNT	= 3,	/* failcnt */
	RES_SOFT_LIMIT	= 4,	/* soft_limit */
};
92
93/*
94 * helpers for accounting
95 */
96
97void res_counter_init(struct res_counter *counter, struct res_counter *parent);
98
99/*
100 * charge - try to consume more resource.
101 *
102 * @counter: the counter
103 * @val: the amount of the resource. each controller defines its own
104 * units, e.g. numbers, bytes, Kbytes, etc
105 *
106 * returns 0 on success and <0 if the counter->usage will exceed the
107 * counter->limit
108 *
109 * charge_nofail works the same, except that it charges the resource
110 * counter unconditionally, and returns < 0 if, after the current
111 * charge, we are over the limit.
112 */
113
114int __must_check res_counter_charge(struct res_counter *counter,
115 unsigned long val, struct res_counter **limit_fail_at);
116int res_counter_charge_nofail(struct res_counter *counter,
117 unsigned long val, struct res_counter **limit_fail_at);
118
119/*
120 * uncharge - tell that some portion of the resource is released
121 *
122 * @counter: the counter
123 * @val: the amount of the resource
124 *
125 * these calls check for usage underflow and show a warning on the console
126 *
127 * returns the total charges still present in @counter.
128 */
129
130u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
131
132u64 res_counter_uncharge_until(struct res_counter *counter,
133 struct res_counter *top,
134 unsigned long val);
135/**
136 * res_counter_margin - calculate chargeable space of a counter
137 * @cnt: the counter
138 *
139 * Returns the difference between the hard limit and the current usage
140 * of resource counter @cnt.
141 */
142static inline unsigned long long res_counter_margin(struct res_counter *cnt)
143{
144 unsigned long long margin;
145 unsigned long flags;
146
147 spin_lock_irqsave(&cnt->lock, flags);
148 if (cnt->limit > cnt->usage)
149 margin = cnt->limit - cnt->usage;
150 else
151 margin = 0;
152 spin_unlock_irqrestore(&cnt->lock, flags);
153 return margin;
154}
155
156/**
157 * Get the difference between the usage and the soft limit
158 * @cnt: The counter
159 *
160 * Returns 0 if usage is less than or equal to soft limit
161 * The difference between usage and soft limit, otherwise.
162 */
163static inline unsigned long long
164res_counter_soft_limit_excess(struct res_counter *cnt)
165{
166 unsigned long long excess;
167 unsigned long flags;
168
169 spin_lock_irqsave(&cnt->lock, flags);
170 if (cnt->usage <= cnt->soft_limit)
171 excess = 0;
172 else
173 excess = cnt->usage - cnt->soft_limit;
174 spin_unlock_irqrestore(&cnt->lock, flags);
175 return excess;
176}
177
178static inline void res_counter_reset_max(struct res_counter *cnt)
179{
180 unsigned long flags;
181
182 spin_lock_irqsave(&cnt->lock, flags);
183 cnt->max_usage = cnt->usage;
184 spin_unlock_irqrestore(&cnt->lock, flags);
185}
186
187static inline void res_counter_reset_failcnt(struct res_counter *cnt)
188{
189 unsigned long flags;
190
191 spin_lock_irqsave(&cnt->lock, flags);
192 cnt->failcnt = 0;
193 spin_unlock_irqrestore(&cnt->lock, flags);
194}
195
196static inline int res_counter_set_limit(struct res_counter *cnt,
197 unsigned long long limit)
198{
199 unsigned long flags;
200 int ret = -EBUSY;
201
202 spin_lock_irqsave(&cnt->lock, flags);
203 if (cnt->usage <= limit) {
204 cnt->limit = limit;
205 ret = 0;
206 }
207 spin_unlock_irqrestore(&cnt->lock, flags);
208 return ret;
209}
210
211static inline int
212res_counter_set_soft_limit(struct res_counter *cnt,
213 unsigned long long soft_limit)
214{
215 unsigned long flags;
216
217 spin_lock_irqsave(&cnt->lock, flags);
218 cnt->soft_limit = soft_limit;
219 spin_unlock_irqrestore(&cnt->lock, flags);
220 return 0;
221}
222
223#endif
diff --git a/init/Kconfig b/init/Kconfig
index a60d1442d1df..1761c72bc1a0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -972,12 +972,6 @@ config CGROUP_CPUACCT
972 Provides a simple Resource Controller for monitoring the 972 Provides a simple Resource Controller for monitoring the
973 total CPU consumed by the tasks in a cgroup. 973 total CPU consumed by the tasks in a cgroup.
974 974
975config RESOURCE_COUNTERS
976 bool "Resource counters"
977 help
978 This option enables controller independent resource accounting
979 infrastructure that works with cgroups.
980
981config PAGE_COUNTER 975config PAGE_COUNTER
982 bool 976 bool
983 977
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
57obj-$(CONFIG_USER_NS) += user_namespace.o 57obj-$(CONFIG_USER_NS) += user_namespace.o
58obj-$(CONFIG_PID_NS) += pid_namespace.o 58obj-$(CONFIG_PID_NS) += pid_namespace.o
59obj-$(CONFIG_IKCONFIG) += configs.o 59obj-$(CONFIG_IKCONFIG) += configs.o
60obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
61obj-$(CONFIG_SMP) += stop_machine.o 60obj-$(CONFIG_SMP) += stop_machine.o
62obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 61obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
63obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 62obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
/*
 * Try to charge @val to @counter and all of its ancestors. When a level
 * refuses, nothing stays charged and *@limit_fail_at names that level.
 * Returns 0 on success and a negative error otherwise.
 */
int res_counter_charge(struct res_counter *counter, unsigned long val,
		       struct res_counter **limit_fail_at)
{
	const bool force = false;

	return __res_counter_charge(counter, val, limit_fail_at, force);
}
92
/*
 * Charge @val to @counter and all of its ancestors unconditionally. A
 * negative return only reports that some level went over its limit;
 * *@limit_fail_at names the first such level. The charge stays applied.
 */
int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
			      struct res_counter **limit_fail_at)
{
	const bool force = true;

	return __res_counter_charge(counter, val, limit_fail_at, force);
}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}