author     Tejun Heo <tj@kernel.org>  2017-09-25 11:12:05 -0400
committer  Tejun Heo <tj@kernel.org>  2017-09-25 11:12:05 -0400
commit     041cd640b2f3c5607171c59d8712b503659d21f7 (patch)
tree       2979112393aefa10e23245ae95f481763280dd6f
parent     d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 (diff)
cgroup: Implement cgroup2 basic CPU usage accounting
In cgroup1, while cpuacct isn't actually controlling any resource, it is a
separate controller due to a combination of two factors: 1. enabling the cpu
controller has significant side effects, and 2. we have to pick one of the
hierarchies to account CPU usage on.  The cpuacct controller is effectively
used to designate a hierarchy to track CPU usage on.

cgroup2's unified hierarchy removes the second reason and we can account
basic CPU usage by default.  While we could use cpuacct for this purpose,
both its interface and implementation leave a lot to be desired - it
collects and exposes two sources of truth which don't agree with each
other, and some of the exposed statistics don't make much sense.  Also, it
propagates all the way up the hierarchy on each accounting event, which is
unnecessary.

This patch adds a basic resource accounting mechanism to cgroup2's unified
hierarchy and accounts CPU usage using it.

* All accounting is done per-cpu and doesn't propagate immediately.  An
  update just bumps the per-cgroup per-cpu counters and links the cgroup to
  the parent's updated list if it isn't already on it.

* On a read, the per-cpu counters are collected into the global ones and
  then propagated upwards.  Only the per-cpu counters which have changed
  since the last read are propagated.

* CPU usage stats are collected and shown in "cgroup.stat" with the "cpu."
  prefix.  Total usage is collected from scheduling events.  The user/sys
  breakdown is sourced from tick sampling and adjusted to the total usage
  using cputime_adjust().

This keeps the accounting-side hot path O(1) and per-cpu, while the read
side is O(nr_updated_since_last_read).

v2: Minor changes and documentation updates as suggested by Waiman and
    Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Roman Gushchin <guro@fb.com>
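For context on how the hooks introduced below are intended to be used: the
scheduler's accounting paths call cgroup_account_cputime() with precise
runtime deltas and cgroup_account_cputime_field() with tick-based user/sys
samples.  The call sites are not part of this patch; the following is only
an illustrative kernel-side sketch, and the caller names are assumptions.

#include <linux/cgroup.h>
#include <linux/kernel_stat.h>
#include <linux/sched.h>

/* Illustrative only -- not part of this patch.  The callers (e.g. the
 * scheduler's runtime update and the tick accounting path) are assumptions;
 * cgroup_account_cputime*() are the hooks added in include/linux/cgroup.h. */
static void example_charge_runtime(struct task_struct *task, u64 delta_exec)
{
	/* O(1) hot path: bumps the per-cgroup per-cpu counter and at most
	 * links the cgroup onto its parent's per-cpu updated list. */
	cgroup_account_cputime(task, delta_exec);
}

static void example_charge_tick(struct task_struct *task, u64 cputime, bool user)
{
	/* the user/sys breakdown is sampled at tick granularity and later
	 * adjusted against the precise total via cputime_adjust() */
	cgroup_account_cputime_field(task,
				     user ? CPUTIME_USER : CPUTIME_SYSTEM,
				     cputime);
}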
-rw-r--r--  Documentation/cgroup-v2.txt        9
-rw-r--r--  include/linux/cgroup-defs.h       57
-rw-r--r--  include/linux/cgroup.h            22
-rw-r--r--  kernel/cgroup/Makefile             2
-rw-r--r--  kernel/cgroup/cgroup-internal.h    8
-rw-r--r--  kernel/cgroup/cgroup.c            24
-rw-r--r--  kernel/cgroup/stat.c             334
7 files changed, 453 insertions, 3 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dc44785dc0fa..3f8216912df0 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -886,6 +886,15 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
+	  cpu.usage_usec
+		CPU time consumed in the subtree.
+
+	  cpu.user_usec
+		User CPU time consumed in the subtree.
+
+	  cpu.system_usec
+		System CPU time consumed in the subtree.
+
 
 Controllers
 ===========
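From userspace, the new keys appear alongside the existing cgroup.stat
entries.  A minimal sketch of reading them; the mount point and cgroup
directory are examples, not part of the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* path is an example; substitute the actual cgroup2 mount point
	 * and cgroup directory */
	FILE *f = fopen("/sys/fs/cgroup/example/cgroup.stat", "r");
	char key[64];
	unsigned long long val;

	if (!f) {
		perror("fopen");
		return 1;
	}

	/* keys added by this patch: cpu.usage_usec, cpu.user_usec,
	 * cpu.system_usec -- all reported in microseconds */
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strncmp(key, "cpu.", 4))
			printf("%s: %llu us\n", key, val);
	}

	fclose(f);
	return 0;
}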
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ade4a78a54c2..3e55bbd31ad1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
 
@@ -254,6 +255,57 @@ struct css_set {
 	struct rcu_head rcu_head;
 };
 
+/*
+ * cgroup basic resource usage statistics.  Accounting is done per-cpu in
+ * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+ * reads.
+ *
+ * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+ * linked into the updated tree.  On the following read, propagation only
+ * considers and consumes the updated tree.  This makes reading O(the
+ * number of descendants which have been active since last read) instead of
+ * O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently.  The combination can
+ * become very expensive.  By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ */
+struct cgroup_cpu_stat {
+	/*
+	 * ->sync protects all the current counters.  These are the only
+	 * fields which get updated in the hot path.
+	 */
+	struct u64_stats_sync sync;
+	struct task_cputime cputime;
+
+	/*
+	 * Snapshots at the last reading.  These are used to calculate the
+	 * deltas to propagate to the global counters.
+	 */
+	struct task_cputime last_cputime;
+
+	/*
+	 * Child cgroups with stat updates on this cpu since the last read
+	 * are linked on the parent's ->updated_children through
+	 * ->updated_next.
+	 *
+	 * In addition to being more compact, a singly-linked list pointing
+	 * to the cgroup makes it unnecessary for each per-cpu struct to
+	 * point back to the associated cgroup.
+	 *
+	 * Protected by per-cpu cgroup_cpu_stat_lock.
+	 */
+	struct cgroup *updated_children;	/* terminated by self cgroup */
+	struct cgroup *updated_next;		/* NULL iff not on the list */
+};
+
+struct cgroup_stat {
+	/* per-cpu statistics are collected into the following global counters */
+	struct task_cputime cputime;
+	struct prev_cputime prev_cputime;
+};
+
 struct cgroup {
 	/* self css with NULL ->ss, points back to this cgroup */
 	struct cgroup_subsys_state self;
@@ -353,6 +405,11 @@ struct cgroup {
 	 */
 	struct cgroup *dom_cgrp;
 
+	/* cgroup basic resource statistics */
+	struct cgroup_cpu_stat __percpu *cpu_stat;
+	struct cgroup_stat pending_stat;	/* pending from children */
+	struct cgroup_stat stat;
+
 	/*
 	 * list of pidlists, up to two for each namespace (one for procs, one
 	 * for tasks); created on demand.
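The updated_children/updated_next fields above form a per-cpu, self-terminated
singly-linked tree: a cgroup whose updated_next is NULL is not on its parent's
list, and an empty list points back to the owning cgroup itself.  A
stand-alone userspace model of that invariant and of the linking step; the
node type and names are simplified illustrations, not kernel code:

#include <stddef.h>
#include <stdio.h>

/* simplified stand-in for struct cgroup plus its per-cpu cgroup_cpu_stat */
struct node {
	struct node *parent;
	struct node *updated_children;	/* terminated by the node itself */
	struct node *updated_next;	/* NULL iff not on parent's list */
	const char *name;
};

static void node_init(struct node *n, struct node *parent, const char *name)
{
	n->parent = parent;
	n->updated_children = n;	/* self-terminated: empty list */
	n->updated_next = NULL;
	n->name = name;
}

/* mirrors cgroup_cpu_stat_updated(): link @n and all its ancestors */
static void mark_updated(struct node *n)
{
	struct node *p;

	for (p = n->parent; p; n = p, p = n->parent) {
		if (n->updated_next)	/* already linked => ancestors are too */
			break;
		n->updated_next = p->updated_children;
		p->updated_children = n;
	}
}

int main(void)
{
	struct node root, a, b;

	node_init(&root, NULL, "root");
	node_init(&a, &root, "a");
	node_init(&b, &a, "b");

	mark_updated(&b);
	/* b is now on a's updated list, and a on root's */
	printf("%s -> %s\n", root.updated_children->name,
	       a.updated_children->name);	/* prints "a -> b" */
	return 0;
}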
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6cd579329310..328a70ce0e23 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 						 u64 val) {}
 #endif
 
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec);
+
 static inline void cgroup_account_cputime(struct task_struct *task,
 					  u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_charge(task, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime(cgrp, delta_exec);
+	rcu_read_unlock();
 }
 
 static inline void cgroup_account_cputime_field(struct task_struct *task,
 						enum cpu_usage_stat index,
 						u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_account_field(task, index, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime_field(cgrp, index, delta_exec);
+	rcu_read_unlock();
 }
 
 #else	/* CONFIG_CGROUPS */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..0acee616e06c 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,4 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5151ff256c29..fa642c99586a 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -200,6 +200,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_boot(void);
+
+/*
  * namespace.c
  */
 extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..d036625556c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
+	cgroup_stat_show_cputime(seq, "cpu.");
+
 	return 0;
 }
 
@@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work)
 		 */
 		cgroup_put(cgroup_parent(cgrp));
 		kernfs_put(cgrp->kn);
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_exit(cgrp);
 		kfree(cgrp);
 	} else {
 		/*
@@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_flush(cgrp);
+
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
 			tcgrp->nr_dying_descendants--;
@@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_free_cgrp;
 
+	if (cgroup_on_dfl(parent)) {
+		ret = cgroup_stat_init(cgrp);
+		if (ret)
+			goto out_cancel_ref;
+	}
+
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
@@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
-		goto out_cancel_ref;
+		goto out_stat_exit;
 	}
 
 	init_cgroup_housekeeping(cgrp);
@@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
 	return cgrp;
 
+out_stat_exit:
+	if (cgroup_on_dfl(parent))
+		cgroup_stat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5148,6 +5166,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+	cgroup_stat_boot();
+
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..9cce79e89320
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
+ * cpu_stat->updated_children list.  See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+	struct cgroup *parent;
+	unsigned long flags;
+
+	/*
+	 * Speculative already-on-list test.  This may race leading to
+	 * temporary inaccuracies, which is fine.
+	 *
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @cgrp is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+		return;
+
+	raw_spin_lock_irqsave(cpu_lock, flags);
+
+	/* put @cgrp and all ancestors on the corresponding updated lists */
+	for (parent = cgroup_parent(cgrp); parent;
+	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+		/*
+		 * Both additions and removals are bottom-up.  If a cgroup
+		 * is already in the tree, all ancestors are.
+		 */
+		if (cstat->updated_next)
+			break;
+
+		cstat->updated_next = pcstat->updated_children;
+		pcstat->updated_children = cgrp;
+	}
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root.  %NULL @pos starts
+ * the traversal and %NULL return indicates the end.  During traversal,
+ * each returned cgroup is unlinked from the tree.  Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+						  struct cgroup *root, int cpu)
+{
+	struct cgroup_cpu_stat *cstat;
+	struct cgroup *parent;
+
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * We're gonna walk down to the first leaf and visit/remove it.  We
+	 * can pick whatever unvisited node as the starting point.
+	 */
+	if (!pos)
+		pos = root;
+	else
+		pos = cgroup_parent(pos);
+
+	/* walk down to the first leaf */
+	while (true) {
+		cstat = cgroup_cpu_stat(pos, cpu);
+		if (cstat->updated_children == pos)
+			break;
+		pos = cstat->updated_children;
+	}
+
+	/*
+	 * Unlink @pos from the tree.  As the updated_children list is
+	 * singly linked, we have to walk it to find the removal point.
+	 * However, due to the way we traverse, @pos will be the first
+	 * child in most cases.  The only exception is @root.
+	 */
+	parent = cgroup_parent(pos);
+	if (parent && cstat->updated_next) {
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+		struct cgroup_cpu_stat *ncstat;
+		struct cgroup **nextp;
+
+		nextp = &pcstat->updated_children;
+		while (true) {
+			ncstat = cgroup_cpu_stat(*nextp, cpu);
+			if (*nextp == pos)
+				break;
+
+			WARN_ON_ONCE(*nextp == parent);
+			nextp = &ncstat->updated_next;
+		}
+
+		*nextp = cstat->updated_next;
+		cstat->updated_next = NULL;
+	}
+
+	return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+				   struct cgroup_stat *src_stat)
+{
+	dst_stat->cputime.utime += src_stat->cputime.utime;
+	dst_stat->cputime.stime += src_stat->cputime.stime;
+	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+	struct task_cputime *last_cputime = &cstat->last_cputime;
+	struct task_cputime cputime;
+	struct cgroup_stat delta;
+	unsigned seq;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	/* fetch the current per-cpu values */
+	do {
+		seq = __u64_stats_fetch_begin(&cstat->sync);
+		cputime = cstat->cputime;
+	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+	/* accumulate the deltas to propagate */
+	delta.cputime.utime = cputime.utime - last_cputime->utime;
+	delta.cputime.stime = cputime.stime - last_cputime->stime;
+	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+					 last_cputime->sum_exec_runtime;
+	*last_cputime = cputime;
+
+	/* transfer the pending stat into delta */
+	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+	/* propagate delta into the global stat and the parent's pending */
+	cgroup_stat_accumulate(&cgrp->stat, &delta);
+	if (parent)
+		cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+	int cpu;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	for_each_possible_cpu(cpu) {
+		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+		struct cgroup *pos = NULL;
+
+		raw_spin_lock_irq(cpu_lock);
+		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+			cgroup_cpu_stat_flush_one(pos, cpu);
+		raw_spin_unlock_irq(cpu_lock);
+	}
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards.  After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_stat_mutex);
+	cgroup_stat_flush_locked(cgrp);
+	mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = get_cpu_ptr(cgrp->cpu_stat);
+	u64_stats_update_begin(&cstat->sync);
+	return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+					struct cgroup_cpu_stat *cstat)
+{
+	u64_stats_update_end(&cstat->sync);
+	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+	put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+	cstat->cputime.sum_exec_runtime += delta_exec;
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+	switch (index) {
+	case CPUTIME_USER:
+	case CPUTIME_NICE:
+		cstat->cputime.utime += delta_exec;
+		break;
+	case CPUTIME_SYSTEM:
+	case CPUTIME_IRQ:
+	case CPUTIME_SOFTIRQ:
+		cstat->cputime.stime += delta_exec;
+		break;
+	default:
+		break;
+	}
+
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	u64 usage, utime, stime;
+
+	if (!cgroup_parent(cgrp))
+		return;
+
+	mutex_lock(&cgroup_stat_mutex);
+
+	cgroup_stat_flush_locked(cgrp);
+
+	usage = cgrp->stat.cputime.sum_exec_runtime;
+	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+		       &utime, &stime);
+
+	mutex_unlock(&cgroup_stat_mutex);
+
+	do_div(usage, NSEC_PER_USEC);
+	do_div(utime, NSEC_PER_USEC);
+	do_div(stime, NSEC_PER_USEC);
+
+	seq_printf(seq, "%susage_usec %llu\n"
+		   "%suser_usec %llu\n"
+		   "%ssystem_usec %llu\n",
+		   prefix, usage, prefix, utime, prefix, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+	int cpu;
+
+	/* the root cgrp has cpu_stat preallocated */
+	if (!cgrp->cpu_stat) {
+		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+		if (!cgrp->cpu_stat)
+			return -ENOMEM;
+	}
+
+	/* ->updated_children list is self terminated */
+	for_each_possible_cpu(cpu)
+		cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+	prev_cputime_init(&cgrp->stat.prev_cputime);
+
+	return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+	int cpu;
+
+	cgroup_stat_flush(cgrp);
+
+	/* sanity check */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+		    WARN_ON_ONCE(cstat->updated_next))
+			return;
+	}
+
+	free_percpu(cgrp->cpu_stat);
+	cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
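To summarize the read-side arithmetic in cgroup_cpu_stat_flush_one() above:
each per-cpu counter is only ever consumed as a delta against its last
snapshot, and that delta is folded both into the cgroup's own global
counters and into the parent's pending bucket, to be picked up when the
parent is flushed.  A stand-alone userspace model of that step; the types
and names are simplified illustrations, not kernel code:

#include <stdio.h>

struct counters { unsigned long long utime, stime, sum_exec; };

struct grp {
	struct grp *parent;
	struct counters percpu_cur;	/* hot-path counters (one cpu shown) */
	struct counters percpu_last;	/* snapshot taken at the last flush */
	struct counters pending;	/* deltas parked here by children */
	struct counters stat;		/* global, what cgroup.stat reports */
};

static void acc(struct counters *dst, const struct counters *src)
{
	dst->utime += src->utime;
	dst->stime += src->stime;
	dst->sum_exec += src->sum_exec;
}

/* mirrors cgroup_cpu_stat_flush_one() for a single cpu */
static void flush_one(struct grp *g)
{
	struct counters delta = {
		.utime = g->percpu_cur.utime - g->percpu_last.utime,
		.stime = g->percpu_cur.stime - g->percpu_last.stime,
		.sum_exec = g->percpu_cur.sum_exec - g->percpu_last.sum_exec,
	};

	g->percpu_last = g->percpu_cur;		/* advance the snapshot */
	acc(&delta, &g->pending);		/* pick up children's deltas */
	g->pending = (struct counters){ 0 };

	acc(&g->stat, &delta);			/* own global counters */
	if (g->parent)
		acc(&g->parent->pending, &delta); /* parked for parent's flush */
}

int main(void)
{
	struct grp parent = { 0 }, child = { .parent = &parent };

	child.percpu_cur.sum_exec = 1000;	/* hot path bumped this cpu */
	flush_one(&child);			/* child.stat gets the delta */
	flush_one(&parent);			/* picks it up from ->pending */

	printf("child %llu, parent %llu\n",
	       child.stat.sum_exec, parent.stat.sum_exec);	/* 1000, 1000 */
	return 0;
}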