author    Tejun Heo <tj@kernel.org>    2017-09-25 11:12:05 -0400
committer Tejun Heo <tj@kernel.org>    2017-09-25 11:12:05 -0400
commit    041cd640b2f3c5607171c59d8712b503659d21f7
tree      2979112393aefa10e23245ae95f481763280dd6f /kernel/cgroup
parent    d2cc5ed6949085cfba30ec5228816cf6eb1d02b9
cgroup: Implement cgroup2 basic CPU usage accounting
In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to a combination of two factors: 1. enabling the cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. The cpuacct controller is effectively used to designate a hierarchy to track CPU usages on.

cgroup2's unified hierarchy removes the second reason, and we can account basic CPU usages by default. While we could use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other, and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event, which is unnecessary.

This patch adds a basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it.

* All accounting is done per-cpu and doesn't propagate immediately. An accounting event just bumps the per-cgroup per-cpu counters and links the cgroup to the parent's updated list if it isn't already on it.

* On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated.

* CPU usage stats are collected and shown in "cgroup.stat" with a "cpu." prefix. Total usage is collected from scheduling events. The user/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust().

This keeps the accounting-side hot path O(1) and per-cpu, and the read side O(nr_updated_since_last_read).

v2: Minor changes and documentation updates as suggested by Waiman and Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Roman Gushchin <guro@fb.com>
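For illustration, after this patch a read of "cgroup.stat" on a non-root cgroup2 cgroup would show the existing descendant counters followed by the new "cpu."-prefixed keys emitted by cgroup_stat_show_cputime(); the values below are hypothetical, only the key names come from the patch:

  nr_dying_descendants 0
  cpu.usage_usec 1426374
  cpu.user_usec 1017371
  cpu.system_usec 409003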
Diffstat (limited to 'kernel/cgroup')
 -rw-r--r--  kernel/cgroup/Makefile            2
 -rw-r--r--  kernel/cgroup/cgroup-internal.h   8
 -rw-r--r--  kernel/cgroup/cgroup.c           24
 -rw-r--r--  kernel/cgroup/stat.c            334
 4 files changed, 365 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..0acee616e06c 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,4 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5151ff256c29..fa642c99586a 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -200,6 +200,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_boot(void);
+
+/*
  * namespace.c
  */
 extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..d036625556c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
         seq_printf(seq, "nr_dying_descendants %d\n",
                    cgroup->nr_dying_descendants);
 
+        cgroup_stat_show_cputime(seq, "cpu.");
+
         return 0;
 }
 
@@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work)
                  */
                 cgroup_put(cgroup_parent(cgrp));
                 kernfs_put(cgrp->kn);
+                if (cgroup_on_dfl(cgrp))
+                        cgroup_stat_exit(cgrp);
                 kfree(cgrp);
         } else {
                 /*
@@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work)
                 /* cgroup release path */
                 trace_cgroup_release(cgrp);
 
+                if (cgroup_on_dfl(cgrp))
+                        cgroup_stat_flush(cgrp);
+
                 for (tcgrp = cgroup_parent(cgrp); tcgrp;
                      tcgrp = cgroup_parent(tcgrp))
                         tcgrp->nr_dying_descendants--;
@@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
         if (ret)
                 goto out_free_cgrp;
 
+        if (cgroup_on_dfl(parent)) {
+                ret = cgroup_stat_init(cgrp);
+                if (ret)
+                        goto out_cancel_ref;
+        }
+
         /*
          * Temporarily set the pointer to NULL, so idr_find() won't return
          * a half-baked cgroup.
@@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
         if (cgrp->id < 0) {
                 ret = -ENOMEM;
-                goto out_cancel_ref;
+                goto out_stat_exit;
         }
 
         init_cgroup_housekeeping(cgrp);
@@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
         return cgrp;
 
+out_stat_exit:
+        if (cgroup_on_dfl(parent))
+                cgroup_stat_exit(cgrp);
 out_cancel_ref:
         percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5148,6 +5166,8 @@ int __init cgroup_init(void)
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+        cgroup_stat_boot();
+
         /*
          * The latency of the synchronize_sched() is too high for cgroups,
          * avoid it at the cost of forcing all readers into the slow path.
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..9cce79e89320
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+        return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+        struct cgroup *parent;
+        unsigned long flags;
+
+        /*
+         * Speculative already-on-list test. This may race leading to
+         * temporary inaccuracies, which is fine.
+         *
+         * Because @parent's updated_children is terminated with @parent
+         * instead of NULL, we can tell whether @cgrp is on the list by
+         * testing the next pointer for NULL.
+         */
+        if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+                return;
+
+        raw_spin_lock_irqsave(cpu_lock, flags);
+
+        /* put @cgrp and all ancestors on the corresponding updated lists */
+        for (parent = cgroup_parent(cgrp); parent;
+             cgrp = parent, parent = cgroup_parent(cgrp)) {
+                struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+                struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+                /*
+                 * Both additions and removals are bottom-up. If a cgroup
+                 * is already in the tree, all ancestors are.
+                 */
+                if (cstat->updated_next)
+                        break;
+
+                cstat->updated_next = pcstat->updated_children;
+                pcstat->updated_children = cgrp;
+        }
+
+        raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+                                                  struct cgroup *root, int cpu)
+{
+        struct cgroup_cpu_stat *cstat;
+        struct cgroup *parent;
+
+        if (pos == root)
+                return NULL;
+
+        /*
+         * We're gonna walk down to the first leaf and visit/remove it. We
+         * can pick whatever unvisited node as the starting point.
+         */
+        if (!pos)
+                pos = root;
+        else
+                pos = cgroup_parent(pos);
+
+        /* walk down to the first leaf */
+        while (true) {
+                cstat = cgroup_cpu_stat(pos, cpu);
+                if (cstat->updated_children == pos)
+                        break;
+                pos = cstat->updated_children;
+        }
+
+        /*
+         * Unlink @pos from the tree. As the updated_children list is
+         * singly linked, we have to walk it to find the removal point.
+         * However, due to the way we traverse, @pos will be the first
+         * child in most cases. The only exception is @root.
+         */
+        parent = cgroup_parent(pos);
+        if (parent && cstat->updated_next) {
+                struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+                struct cgroup_cpu_stat *ncstat;
+                struct cgroup **nextp;
+
+                nextp = &pcstat->updated_children;
+                while (true) {
+                        ncstat = cgroup_cpu_stat(*nextp, cpu);
+                        if (*nextp == pos)
+                                break;
+
+                        WARN_ON_ONCE(*nextp == parent);
+                        nextp = &ncstat->updated_next;
+                }
+
+                *nextp = cstat->updated_next;
+                cstat->updated_next = NULL;
+        }
+
+        return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+                                   struct cgroup_stat *src_stat)
+{
+        dst_stat->cputime.utime += src_stat->cputime.utime;
+        dst_stat->cputime.stime += src_stat->cputime.stime;
+        dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+        struct cgroup *parent = cgroup_parent(cgrp);
+        struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+        struct task_cputime *last_cputime = &cstat->last_cputime;
+        struct task_cputime cputime;
+        struct cgroup_stat delta;
+        unsigned seq;
+
+        lockdep_assert_held(&cgroup_stat_mutex);
+
+        /* fetch the current per-cpu values */
+        do {
+                seq = __u64_stats_fetch_begin(&cstat->sync);
+                cputime = cstat->cputime;
+        } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+        /* accumulate the deltas to propagate */
+        delta.cputime.utime = cputime.utime - last_cputime->utime;
+        delta.cputime.stime = cputime.stime - last_cputime->stime;
+        delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+                                         last_cputime->sum_exec_runtime;
+        *last_cputime = cputime;
+
+        /* transfer the pending stat into delta */
+        cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+        memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+        /* propagate delta into the global stat and the parent's pending */
+        cgroup_stat_accumulate(&cgrp->stat, &delta);
+        if (parent)
+                cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+        int cpu;
+
+        lockdep_assert_held(&cgroup_stat_mutex);
+
+        for_each_possible_cpu(cpu) {
+                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+                struct cgroup *pos = NULL;
+
+                raw_spin_lock_irq(cpu_lock);
+                while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+                        cgroup_cpu_stat_flush_one(pos, cpu);
+                raw_spin_unlock_irq(cpu_lock);
+        }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+        mutex_lock(&cgroup_stat_mutex);
+        cgroup_stat_flush_locked(cgrp);
+        mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+        struct cgroup_cpu_stat *cstat;
+
+        cstat = get_cpu_ptr(cgrp->cpu_stat);
+        u64_stats_update_begin(&cstat->sync);
+        return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+                                        struct cgroup_cpu_stat *cstat)
+{
+        u64_stats_update_end(&cstat->sync);
+        cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+        put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+        struct cgroup_cpu_stat *cstat;
+
+        cstat = cgroup_cpu_stat_account_begin(cgrp);
+        cstat->cputime.sum_exec_runtime += delta_exec;
+        cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+                                    enum cpu_usage_stat index, u64 delta_exec)
+{
+        struct cgroup_cpu_stat *cstat;
+
+        cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+        switch (index) {
+        case CPUTIME_USER:
+        case CPUTIME_NICE:
+                cstat->cputime.utime += delta_exec;
+                break;
+        case CPUTIME_SYSTEM:
+        case CPUTIME_IRQ:
+        case CPUTIME_SOFTIRQ:
+                cstat->cputime.stime += delta_exec;
+                break;
+        default:
+                break;
+        }
+
+        cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+{
+        struct cgroup *cgrp = seq_css(seq)->cgroup;
+        u64 usage, utime, stime;
+
+        if (!cgroup_parent(cgrp))
+                return;
+
+        mutex_lock(&cgroup_stat_mutex);
+
+        cgroup_stat_flush_locked(cgrp);
+
+        usage = cgrp->stat.cputime.sum_exec_runtime;
+        cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+                       &utime, &stime);
+
+        mutex_unlock(&cgroup_stat_mutex);
+
+        do_div(usage, NSEC_PER_USEC);
+        do_div(utime, NSEC_PER_USEC);
+        do_div(stime, NSEC_PER_USEC);
+
+        seq_printf(seq, "%susage_usec %llu\n"
+                   "%suser_usec %llu\n"
+                   "%ssystem_usec %llu\n",
+                   prefix, usage, prefix, utime, prefix, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+        int cpu;
+
+        /* the root cgrp has cpu_stat preallocated */
+        if (!cgrp->cpu_stat) {
+                cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+                if (!cgrp->cpu_stat)
+                        return -ENOMEM;
+        }
+
+        /* ->updated_children list is self terminated */
+        for_each_possible_cpu(cpu)
+                cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+        prev_cputime_init(&cgrp->stat.prev_cputime);
+
+        return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+        int cpu;
+
+        cgroup_stat_flush(cgrp);
+
+        /* sanity check */
+        for_each_possible_cpu(cpu) {
+                struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+                if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+                    WARN_ON_ONCE(cstat->updated_next))
+                        return;
+        }
+
+        free_percpu(cgrp->cpu_stat);
+        cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+        int cpu;
+
+        for_each_possible_cpu(cpu)
+                raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+        BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
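
The scheduler-side callers of __cgroup_account_cputime() and __cgroup_account_cputime_field() live outside kernel/cgroup and are therefore not part of this filtered diff. As a minimal sketch only - the wrapper name, the task_dfl_cgroup() lookup, and the root-cgroup check are assumptions for illustration, not code taken from this patch - a hook invoked with a task's runtime delta could feed the per-cpu counters like this:

  #include <linux/cgroup.h>
  #include <linux/sched.h>

  /* Hypothetical caller sketch, not part of this patch. */
  static inline void account_task_cputime(struct task_struct *task,
                                          u64 delta_exec)
  {
          /* the task's cgroup on the default (cgroup2) hierarchy */
          struct cgroup *cgrp = task_dfl_cgroup(task);

          /*
           * Skip the root cgroup; cgroup_stat_show_cputime() above
           * doesn't report anything for it either.
           */
          if (cgroup_parent(cgrp))
                  __cgroup_account_cputime(cgrp, delta_exec);
  }

A tick-side hook would similarly classify the sampled time by passing CPUTIME_USER, CPUTIME_SYSTEM, or one of the related indices to __cgroup_account_cputime_field().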