author    Tejun Heo <tj@kernel.org>    2017-09-25 11:12:05 -0400
committer Tejun Heo <tj@kernel.org>    2017-09-25 11:12:05 -0400
commit    041cd640b2f3c5607171c59d8712b503659d21f7
tree      2979112393aefa10e23245ae95f481763280dd6f /kernel/cgroup
parent    d2cc5ed6949085cfba30ec5228816cf6eb1d02b9
cgroup: Implement cgroup2 basic CPU usage accounting
In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to a combination of two factors: 1. enabling the cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. The cpuacct controller is effectively used to designate a hierarchy to track CPU usages on.

cgroup2's unified hierarchy removes the second reason, and we can account basic CPU usages by default. While we could use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other, and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event, which is unnecessary.

This patch adds a basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it.

* All accounting is done per-cpu and doesn't propagate immediately. An accounting event just bumps the per-cgroup per-cpu counters and links the cgroup to the parent's updated list if it isn't already on it.

* On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated.

* CPU usage stats are collected and shown in "cgroup.stat" with a "cpu." prefix. Total usage is collected from scheduling events. The user/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust().

This keeps the accounting-side hot path O(1) and per-cpu, and the read side O(nr_updated_since_last_read).

v2: Minor changes and documentation updates as suggested by Waiman and Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Roman Gushchin <guro@fb.com>
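For illustration, after this patch a read of "cgroup.stat" on a non-root cgroup2 cgroup would show the existing descendant counters followed by the new "cpu."-prefixed keys emitted by cgroup_stat_show_cputime(); the values below are hypothetical, only the key names come from the patch:

  nr_dying_descendants 0
  cpu.usage_usec 1426374
  cpu.user_usec 1017371
  cpu.system_usec 409003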
Diffstat (limited to 'kernel/cgroup')
 -rw-r--r--  kernel/cgroup/Makefile            2
 -rw-r--r--  kernel/cgroup/cgroup-internal.h   8
 -rw-r--r--  kernel/cgroup/cgroup.c           24
 -rw-r--r--  kernel/cgroup/stat.c            334
 4 files changed, 365 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..0acee616e06c 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,4 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5151ff256c29..fa642c99586a 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -200,6 +200,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_boot(void);
+
+/*
  * namespace.c
  */
 extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..d036625556c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
         seq_printf(seq, "nr_dying_descendants %d\n",
                    cgroup->nr_dying_descendants);
 
+        cgroup_stat_show_cputime(seq, "cpu.");
+
         return 0;
 }
 
@@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work)
                  */
                 cgroup_put(cgroup_parent(cgrp));
                 kernfs_put(cgrp->kn);
+                if (cgroup_on_dfl(cgrp))
+                        cgroup_stat_exit(cgrp);
                 kfree(cgrp);
         } else {
                 /*
@@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work)
                 /* cgroup release path */
                 trace_cgroup_release(cgrp);
 
+                if (cgroup_on_dfl(cgrp))
+                        cgroup_stat_flush(cgrp);
+
                 for (tcgrp = cgroup_parent(cgrp); tcgrp;
                      tcgrp = cgroup_parent(tcgrp))
                         tcgrp->nr_dying_descendants--;
@@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
         if (ret)
                 goto out_free_cgrp;
 
+        if (cgroup_on_dfl(parent)) {
+                ret = cgroup_stat_init(cgrp);
+                if (ret)
+                        goto out_cancel_ref;
+        }
+
         /*
          * Temporarily set the pointer to NULL, so idr_find() won't return
          * a half-baked cgroup.
@@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
         if (cgrp->id < 0) {
                 ret = -ENOMEM;
-                goto out_cancel_ref;
+                goto out_stat_exit;
         }
 
         init_cgroup_housekeeping(cgrp);
@@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
         return cgrp;
 
+out_stat_exit:
+        if (cgroup_on_dfl(parent))
+                cgroup_stat_exit(cgrp);
 out_cancel_ref:
         percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5148,6 +5166,8 @@ int __init cgroup_init(void)
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+        cgroup_stat_boot();
+
         /*
          * The latency of the synchronize_sched() is too high for cgroups,
          * avoid it at the cost of forcing all readers into the slow path.
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..9cce79e89320
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+        return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+        struct cgroup *parent;
+        unsigned long flags;
+
+        /*
+         * Speculative already-on-list test. This may race leading to
+         * temporary inaccuracies, which is fine.
+         *
+         * Because @parent's updated_children is terminated with @parent
+         * instead of NULL, we can tell whether @cgrp is on the list by
+         * testing the next pointer for NULL.
+         */
+        if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+                return;
+
+        raw_spin_lock_irqsave(cpu_lock, flags);
+
+        /* put @cgrp and all ancestors on the corresponding updated lists */
+        for (parent = cgroup_parent(cgrp); parent;
+             cgrp = parent, parent = cgroup_parent(cgrp)) {
+                struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+                struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+                /*
+                 * Both additions and removals are bottom-up. If a cgroup
+                 * is already in the tree, all ancestors are.
+                 */
+                if (cstat->updated_next)
+                        break;
+
+                cstat->updated_next = pcstat->updated_children;
+                pcstat->updated_children = cgrp;
+        }
+
+        raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+                                                  struct cgroup *root, int cpu)
+{
+        struct cgroup_cpu_stat *cstat;
+        struct cgroup *parent;
+
+        if (pos == root)
+                return NULL;
+
+        /*
+         * We're gonna walk down to the first leaf and visit/remove it. We
+         * can pick whatever unvisited node as the starting point.
+         */
+        if (!pos)
+                pos = root;
+        else
+                pos = cgroup_parent(pos);
+
+        /* walk down to the first leaf */
+        while (true) {
+                cstat = cgroup_cpu_stat(pos, cpu);
+                if (cstat->updated_children == pos)
+                        break;
+                pos = cstat->updated_children;
+        }
+
+        /*
+         * Unlink @pos from the tree. As the updated_children list is
+         * singly linked, we have to walk it to find the removal point.
+         * However, due to the way we traverse, @pos will be the first
+         * child in most cases. The only exception is @root.
+         */
+        parent = cgroup_parent(pos);
+        if (parent && cstat->updated_next) {
+                struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+                struct cgroup_cpu_stat *ncstat;
+                struct cgroup **nextp;
+
+                nextp = &pcstat->updated_children;
+                while (true) {
+                        ncstat = cgroup_cpu_stat(*nextp, cpu);
+                        if (*nextp == pos)
+                                break;
+
+                        WARN_ON_ONCE(*nextp == parent);
+                        nextp = &ncstat->updated_next;
+                }
+
+                *nextp = cstat->updated_next;
+                cstat->updated_next = NULL;
+        }
+
+        return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+                                   struct cgroup_stat *src_stat)
+{
+        dst_stat->cputime.utime += src_stat->cputime.utime;
+        dst_stat->cputime.stime += src_stat->cputime.stime;
+        dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+        struct cgroup *parent = cgroup_parent(cgrp);
+        struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+        struct task_cputime *last_cputime = &cstat->last_cputime;
+        struct task_cputime cputime;
+        struct cgroup_stat delta;
+        unsigned seq;
+
+        lockdep_assert_held(&cgroup_stat_mutex);
+
+        /* fetch the current per-cpu values */
+        do {
+                seq = __u64_stats_fetch_begin(&cstat->sync);
+                cputime = cstat->cputime;
+        } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+        /* accumulate the deltas to propagate */
+        delta.cputime.utime = cputime.utime - last_cputime->utime;
+        delta.cputime.stime = cputime.stime - last_cputime->stime;
+        delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+                                         last_cputime->sum_exec_runtime;
+        *last_cputime = cputime;
+
+        /* transfer the pending stat into delta */
+        cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+        memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+        /* propagate delta into the global stat and the parent's pending */
+        cgroup_stat_accumulate(&cgrp->stat, &delta);
+        if (parent)
+                cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+        int cpu;
+
+        lockdep_assert_held(&cgroup_stat_mutex);
+
+        for_each_possible_cpu(cpu) {
+                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+                struct cgroup *pos = NULL;
+
+                raw_spin_lock_irq(cpu_lock);
+                while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+                        cgroup_cpu_stat_flush_one(pos, cpu);
+                raw_spin_unlock_irq(cpu_lock);
+        }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+        mutex_lock(&cgroup_stat_mutex);
+        cgroup_stat_flush_locked(cgrp);
+        mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+        struct cgroup_cpu_stat *cstat;
+
+        cstat = get_cpu_ptr(cgrp->cpu_stat);
+        u64_stats_update_begin(&cstat->sync);
+        return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+                                        struct cgroup_cpu_stat *cstat)
+{
+        u64_stats_update_end(&cstat->sync);
+        cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+        put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+        struct cgroup_cpu_stat *cstat;
+
+        cstat = cgroup_cpu_stat_account_begin(cgrp);
+        cstat->cputime.sum_exec_runtime += delta_exec;
+        cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+                                    enum cpu_usage_stat index, u64 delta_exec)
+{
+        struct cgroup_cpu_stat *cstat;
+
+        cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+        switch (index) {
+        case CPUTIME_USER:
+        case CPUTIME_NICE:
+                cstat->cputime.utime += delta_exec;
+                break;
+        case CPUTIME_SYSTEM:
+        case CPUTIME_IRQ:
+        case CPUTIME_SOFTIRQ:
+                cstat->cputime.stime += delta_exec;
+                break;
+        default:
+                break;
+        }
+
+        cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+{
+        struct cgroup *cgrp = seq_css(seq)->cgroup;
+        u64 usage, utime, stime;
+
+        if (!cgroup_parent(cgrp))
+                return;
+
+        mutex_lock(&cgroup_stat_mutex);
+
+        cgroup_stat_flush_locked(cgrp);
+
+        usage = cgrp->stat.cputime.sum_exec_runtime;
+        cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+                       &utime, &stime);
+
+        mutex_unlock(&cgroup_stat_mutex);
+
+        do_div(usage, NSEC_PER_USEC);
+        do_div(utime, NSEC_PER_USEC);
+        do_div(stime, NSEC_PER_USEC);
+
+        seq_printf(seq, "%susage_usec %llu\n"
+                   "%suser_usec %llu\n"
+                   "%ssystem_usec %llu\n",
+                   prefix, usage, prefix, utime, prefix, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+        int cpu;
+
+        /* the root cgrp has cpu_stat preallocated */
+        if (!cgrp->cpu_stat) {
+                cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+                if (!cgrp->cpu_stat)
+                        return -ENOMEM;
+        }
+
+        /* ->updated_children list is self terminated */
+        for_each_possible_cpu(cpu)
+                cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+        prev_cputime_init(&cgrp->stat.prev_cputime);
+
+        return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+        int cpu;
+
+        cgroup_stat_flush(cgrp);
+
+        /* sanity check */
+        for_each_possible_cpu(cpu) {
+                struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+                if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+                    WARN_ON_ONCE(cstat->updated_next))
+                        return;
+        }
+
+        free_percpu(cgrp->cpu_stat);
+        cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+        int cpu;
+
+        for_each_possible_cpu(cpu)
+                raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+        BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
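
The scheduler-side callers of __cgroup_account_cputime() and __cgroup_account_cputime_field() live outside kernel/cgroup and are therefore not part of this filtered diff. As a minimal sketch only - the wrapper name, the task_dfl_cgroup() lookup, and the root-cgroup check are assumptions for illustration, not code taken from this patch - a hook invoked with a task's runtime delta could feed the per-cpu counters like this:

  #include <linux/cgroup.h>
  #include <linux/sched.h>

  /* Hypothetical caller sketch, not part of this patch. */
  static inline void account_task_cputime(struct task_struct *task,
                                          u64 delta_exec)
  {
          /* the task's cgroup on the default (cgroup2) hierarchy */
          struct cgroup *cgrp = task_dfl_cgroup(task);

          /*
           * Skip the root cgroup; cgroup_stat_show_cputime() above
           * doesn't report anything for it either.
           */
          if (cgroup_parent(cgrp))
                  __cgroup_account_cputime(cgrp, delta_exec);
  }

A tick-side hook would similarly classify the sampled time by passing CPUTIME_USER, CPUTIME_SYSTEM, or one of the related indices to __cgroup_account_cputime_field().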