author     Linus Torvalds <torvalds@linux-foundation.org>  2017-11-15 17:29:44 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-11-15 17:29:44 -0500
commit     22714a2ba4b55737cd7d5299db7aaf1fa8287354 (patch)
tree       32b25f2e3e40732156a8a8d0dcb2ddf38410776f /kernel/cgroup
parent     766ec76a27aa9dfdfee3a80f29ddc1f7539c71f9 (diff)
parent     5f2e673405b742be64e7c3604ed4ed3ac14f35ce (diff)
Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
"Cgroup2 cpu controller support is finally merged.
- Basic cpu statistics support to allow monitoring by default without
the CPU controller enabled.
- cgroup2 cpu controller support.
- /sys/kernel/cgroup files to help dealing with new / optional
features"
* 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: export list of cgroups v2 features using sysfs
cgroup: export list of delegatable control files using sysfs
cgroup: mark @cgrp __maybe_unused in cpu_stat_show()
MAINTAINERS: relocate cpuset.c
cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat
sched: Implement interface for cgroup unified hierarchy
sched: Misc preps for cgroup unified hierarchy interface
sched/cputime: Add dummy cputime_adjust() implementation for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
cgroup: statically initialize init_css_set->dfl_cgrp
cgroup: Implement cgroup2 basic CPU usage accounting
cpuacct: Introduce cgroup_account_cputime[_field]()
sched/cputime: Expose cputime_adjust()
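
The basic accounting added by this series is reported through a per-cgroup "cpu.stat" file on the unified hierarchy. As a minimal userspace sketch (not part of the series itself), the following reader parses that file; the cgroup2 mount point /sys/fs/cgroup and the group name "test" are assumptions made for the example only.

/* read_cpu_stat.c - hypothetical reader for a cgroup2 cpu.stat file */
#include <stdio.h>

int main(void)
{
	/* Assumed path: cgroup2 mounted at /sys/fs/cgroup, group "test". */
	FILE *f = fopen("/sys/fs/cgroup/test/cpu.stat", "r");
	char key[64];
	unsigned long long val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line is "<key> <value>", e.g. "usage_usec 1234". */
	while (fscanf(f, "%63s %llu", key, &val) == 2)
		printf("%s = %llu\n", key, val);
	fclose(f);
	return 0;
}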
Diffstat (limited to 'kernel/cgroup')
 -rw-r--r--  kernel/cgroup/Makefile           |   2
 -rw-r--r--  kernel/cgroup/cgroup-internal.h  |   9
 -rw-r--r--  kernel/cgroup/cgroup.c           | 157
 -rw-r--r--  kernel/cgroup/stat.c             | 334
 4 files changed, 499 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ae448f7632cc..2be89a003185 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bf54ade001be..b928b27050c6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
+/*
  * namespace.c
  */
 extern const struct proc_ns_operations cgroupns_operations;
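
The entry points declared above are implemented in the new kernel/cgroup/stat.c and consumed by cgroup.c below; the producers are the cgroup_account_cputime[_field]() wrappers added on the scheduler side of this series, which are not part of this diff. The following is only a rough sketch of how such a producer feeds deltas into the per-cpu counters; charge_exec() and charge_user() are made-up names for illustration.

/*
 * Illustrative only: the real callers live behind the scheduler-side
 * cgroup_account_cputime[_field]() wrappers added elsewhere in this
 * series. These helpers are not part of the patch.
 */
static inline void charge_exec(struct cgroup *cgrp, u64 delta_ns)
{
	/* raw execution time, accumulated into sum_exec_runtime */
	__cgroup_account_cputime(cgrp, delta_ns);
}

static inline void charge_user(struct cgroup *cgrp, u64 delta_ns)
{
	/* CPUTIME_USER/CPUTIME_NICE fold into utime, system-type indexes into stime */
	__cgroup_account_cputime_field(cgrp, CPUTIME_USER, delta_ns);
}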
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 00f5b358aeac..0b1ffe147f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -462,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 }
 
 /**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+						     struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	css = cgroup_css(cgrp, ss);
+	if (!css || !css_tryget_online(css))
+		css = NULL;
+	rcu_read_unlock();
+
+	return css;
+}
+
+/**
  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -647,6 +671,14 @@ struct css_set init_css_set = {
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+
+	/*
+	 * The following field is re-initialized when this cset gets linked
+	 * in cgroup_init(). However, let's initialize the field
+	 * statically too so that the default cgroup can be accessed safely
+	 * early during boot.
+	 */
+	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
 };
 
 static int css_set_count = 1;	/* 1 for init_css_set */
@@ -3315,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+						 struct cgroup *cgrp, int ssid)
+{
+	struct cgroup_subsys *ss = cgroup_subsys[ssid];
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	if (!ss->css_extra_stat_show)
+		return 0;
+
+	css = cgroup_tryget_css(cgrp, ss);
+	if (!css)
+		return 0;
+
+	ret = ss->css_extra_stat_show(seq, css);
+	css_put(css);
+	return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+	int ret = 0;
+
+	cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+	return ret;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of->kn->priv;
@@ -4422,6 +4485,11 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cgroup.stat",
 		.seq_show = cgroup_stat_show,
 	},
+	{
+		.name = "cpu.stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stat_show,
+	},
 	{ }	/* terminate */
 };
 
@@ -4482,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work)
 		 */
 		cgroup_put(cgroup_parent(cgrp));
 		kernfs_put(cgrp->kn);
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_exit(cgrp);
 		kfree(cgrp);
 	} else {
 		/*
@@ -4526,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_flush(cgrp);
+
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
 			tcgrp->nr_dying_descendants--;
@@ -4709,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_free_cgrp;
 
+	if (cgroup_on_dfl(parent)) {
+		ret = cgroup_stat_init(cgrp);
+		if (ret)
+			goto out_cancel_ref;
+	}
+
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
@@ -4716,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
-		goto out_cancel_ref;
+		goto out_stat_exit;
 	}
 
 	init_cgroup_housekeeping(cgrp);
@@ -4767,6 +4846,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
 out_idr_free:
 	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+	if (cgroup_on_dfl(parent))
+		cgroup_stat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5161,6 +5243,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+	cgroup_stat_boot();
+
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
@@ -5780,3 +5864,72 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+				      ssize_t size, const char *prefix)
+{
+	struct cftype *cft;
+	ssize_t ret = 0;
+
+	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+			continue;
+
+		if (prefix)
+			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+		if (unlikely(ret >= size)) {
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	struct cgroup_subsys *ss;
+	int ssid;
+	ssize_t ret = 0;
+
+	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+				     NULL);
+
+	for_each_subsys(ss, ssid)
+		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+					      PAGE_SIZE - ret,
+					      cgroup_subsys_name[ssid]);
+
+	return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+	&cgroup_delegate_attr.attr,
+	&cgroup_features_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+	.attrs = cgroup_sysfs_attrs,
+	.name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
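
Once cgroup_sysfs_init() has run, the two attributes registered above appear as /sys/kernel/cgroup/delegate and /sys/kernel/cgroup/features. A small userspace sketch for dumping them follows; the exact contents depend on the compiled-in controllers, so the values shown in the comments are examples only.

/* dump_cgroup_sysfs.c - illustrative reader for the new sysfs files */
#include <stdio.h>

static void dump(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	printf("%s:\n", path);
	while (fgets(line, sizeof(line), f))
		printf("  %s", line);	/* e.g. "cgroup.procs" in delegate */
	fclose(f);
}

int main(void)
{
	dump("/sys/kernel/cgroup/delegate");	/* delegatable control files */
	dump("/sys/kernel/cgroup/features");	/* "nsdelegate" as of this series */
	return 0;
}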
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..133b465691d6
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+	struct cgroup *parent;
+	unsigned long flags;
+
+	/*
+	 * Speculative already-on-list test. This may race leading to
+	 * temporary inaccuracies, which is fine.
+	 *
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @cgrp is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+		return;
+
+	raw_spin_lock_irqsave(cpu_lock, flags);
+
+	/* put @cgrp and all ancestors on the corresponding updated lists */
+	for (parent = cgroup_parent(cgrp); parent;
+	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+		/*
+		 * Both additions and removals are bottom-up. If a cgroup
+		 * is already in the tree, all ancestors are.
+		 */
+		if (cstat->updated_next)
+			break;
+
+		cstat->updated_next = pcstat->updated_children;
+		pcstat->updated_children = cgrp;
+	}
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+						  struct cgroup *root, int cpu)
+{
+	struct cgroup_cpu_stat *cstat;
+	struct cgroup *parent;
+
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * We're gonna walk down to the first leaf and visit/remove it. We
+	 * can pick any unvisited node as the starting point.
+	 */
+	if (!pos)
+		pos = root;
+	else
+		pos = cgroup_parent(pos);
+
+	/* walk down to the first leaf */
+	while (true) {
+		cstat = cgroup_cpu_stat(pos, cpu);
+		if (cstat->updated_children == pos)
+			break;
+		pos = cstat->updated_children;
+	}
+
+	/*
+	 * Unlink @pos from the tree. As the updated_children list is
+	 * singly linked, we have to walk it to find the removal point.
+	 * However, due to the way we traverse, @pos will be the first
+	 * child in most cases. The only exception is @root.
+	 */
+	parent = cgroup_parent(pos);
+	if (parent && cstat->updated_next) {
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+		struct cgroup_cpu_stat *ncstat;
+		struct cgroup **nextp;
+
+		nextp = &pcstat->updated_children;
+		while (true) {
+			ncstat = cgroup_cpu_stat(*nextp, cpu);
+			if (*nextp == pos)
+				break;
+
+			WARN_ON_ONCE(*nextp == parent);
+			nextp = &ncstat->updated_next;
+		}
+
+		*nextp = cstat->updated_next;
+		cstat->updated_next = NULL;
+	}
+
+	return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+				   struct cgroup_stat *src_stat)
+{
+	dst_stat->cputime.utime += src_stat->cputime.utime;
+	dst_stat->cputime.stime += src_stat->cputime.stime;
+	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+	struct task_cputime *last_cputime = &cstat->last_cputime;
+	struct task_cputime cputime;
+	struct cgroup_stat delta;
+	unsigned seq;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	/* fetch the current per-cpu values */
+	do {
+		seq = __u64_stats_fetch_begin(&cstat->sync);
+		cputime = cstat->cputime;
+	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+	/* accumulate the deltas to propagate */
+	delta.cputime.utime = cputime.utime - last_cputime->utime;
+	delta.cputime.stime = cputime.stime - last_cputime->stime;
+	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+					 last_cputime->sum_exec_runtime;
+	*last_cputime = cputime;
+
+	/* transfer the pending stat into delta */
+	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+	/* propagate delta into the global stat and the parent's pending */
+	cgroup_stat_accumulate(&cgrp->stat, &delta);
+	if (parent)
+		cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+	int cpu;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	for_each_possible_cpu(cpu) {
+		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+		struct cgroup *pos = NULL;
+
+		raw_spin_lock_irq(cpu_lock);
+		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+			cgroup_cpu_stat_flush_one(pos, cpu);
+		raw_spin_unlock_irq(cpu_lock);
+	}
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_stat_mutex);
+	cgroup_stat_flush_locked(cgrp);
+	mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = get_cpu_ptr(cgrp->cpu_stat);
+	u64_stats_update_begin(&cstat->sync);
+	return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+					struct cgroup_cpu_stat *cstat)
+{
+	u64_stats_update_end(&cstat->sync);
+	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+	put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+	cstat->cputime.sum_exec_runtime += delta_exec;
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+	switch (index) {
+	case CPUTIME_USER:
+	case CPUTIME_NICE:
+		cstat->cputime.utime += delta_exec;
+		break;
+	case CPUTIME_SYSTEM:
+	case CPUTIME_IRQ:
+	case CPUTIME_SOFTIRQ:
+		cstat->cputime.stime += delta_exec;
+		break;
+	default:
+		break;
+	}
+
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	u64 usage, utime, stime;
+
+	if (!cgroup_parent(cgrp))
+		return;
+
+	mutex_lock(&cgroup_stat_mutex);
+
+	cgroup_stat_flush_locked(cgrp);
+
+	usage = cgrp->stat.cputime.sum_exec_runtime;
+	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+		       &utime, &stime);
+
+	mutex_unlock(&cgroup_stat_mutex);
+
+	do_div(usage, NSEC_PER_USEC);
+	do_div(utime, NSEC_PER_USEC);
+	do_div(stime, NSEC_PER_USEC);
+
+	seq_printf(seq, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, utime, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+	int cpu;
+
+	/* the root cgrp has cpu_stat preallocated */
+	if (!cgrp->cpu_stat) {
+		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+		if (!cgrp->cpu_stat)
+			return -ENOMEM;
+	}
+
+	/* ->updated_children list is self terminated */
+	for_each_possible_cpu(cpu)
+		cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+	prev_cputime_init(&cgrp->stat.prev_cputime);
+
+	return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+	int cpu;
+
+	cgroup_stat_flush(cgrp);
+
+	/* sanity check */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+		    WARN_ON_ONCE(cstat->updated_next))
+			return;
+	}
+
+	free_percpu(cgrp->cpu_stat);
+	cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
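
To make the bookkeeping in cgroup_cpu_stat_updated() and cgroup_cpu_stat_pop_updated() easier to follow, here is a stand-alone toy model of the "self-terminated" per-cpu updated list: an empty updated_children list points back at the node itself, and updated_next == NULL means the node is not linked on its parent's list. This is a userspace sketch for illustration only, not kernel code.

/* updated_list_model.c - toy model of the self-terminated updated list */
#include <stdio.h>
#include <stddef.h>

struct node {
	const char *name;
	struct node *parent;
	struct node *updated_children;	/* points to self when empty */
	struct node *updated_next;	/* NULL when not on parent's list */
};

static void mark_updated(struct node *n)
{
	/* bottom-up linking: stop as soon as an ancestor is already linked */
	for (struct node *p = n->parent; p; n = p, p = n->parent) {
		if (n->updated_next)
			break;
		n->updated_next = p->updated_children;
		p->updated_children = n;
	}
}

int main(void)
{
	struct node root = { "root", NULL }, a = { "a", &root }, b = { "b", &a };

	root.updated_children = &root;
	a.updated_children = &a;
	b.updated_children = &b;

	mark_updated(&b);
	/* b is now on a's list and a on root's list, mirroring the kernel code. */
	printf("root's first updated child: %s\n", root.updated_children->name);
	printf("a's first updated child:    %s\n", a.updated_children->name);
	return 0;
}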