-rw-r--r--  include/linux/cgroup-defs.h       52
-rw-r--r--  include/linux/cgroup.h            12
-rw-r--r--  kernel/cgroup/Makefile             2
-rw-r--r--  kernel/cgroup/cgroup-internal.h   11
-rw-r--r--  kernel/cgroup/cgroup.c           105
-rw-r--r--  kernel/cgroup/rdma.c              35
-rw-r--r--  kernel/cgroup/rstat.c            416
-rw-r--r--  kernel/cgroup/stat.c             338
8 files changed, 554 insertions, 417 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index dc5b70449dc6..c0e68f903011 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -105,6 +105,8 @@ enum {
105struct cgroup_file { 105struct cgroup_file {
106 /* do not access any fields from outside cgroup core */ 106 /* do not access any fields from outside cgroup core */
107 struct kernfs_node *kn; 107 struct kernfs_node *kn;
108 unsigned long notified_at;
109 struct timer_list notify_timer;
108}; 110};
109 111
110/* 112/*
@@ -128,6 +130,9 @@ struct cgroup_subsys_state {
128 struct list_head sibling; 130 struct list_head sibling;
129 struct list_head children; 131 struct list_head children;
130 132
133 /* flush target list anchored at cgrp->rstat_css_list */
134 struct list_head rstat_css_node;
135
131 /* 136 /*
132 * PI: Subsys-unique ID. 0 is unused and root is always 1. The 137 * PI: Subsys-unique ID. 0 is unused and root is always 1. The
133 * matching css can be looked up using css_from_id(). 138 * matching css can be looked up using css_from_id().
@@ -256,12 +261,16 @@ struct css_set {
256 struct rcu_head rcu_head; 261 struct rcu_head rcu_head;
257}; 262};
258 263
264struct cgroup_base_stat {
265 struct task_cputime cputime;
266};
267
259/* 268/*
260 * cgroup basic resource usage statistics. Accounting is done per-cpu in 269 * rstat - cgroup scalable recursive statistics. Accounting is done
261 * cgroup_cpu_stat which is then lazily propagated up the hierarchy on 270 * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
262 * reads. 271 * hierarchy on reads.
263 * 272 *
264 * When a stat gets updated, the cgroup_cpu_stat and its ancestors are 273 * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
265 * linked into the updated tree. On the following read, propagation only 274 * linked into the updated tree. On the following read, propagation only
266 * considers and consumes the updated tree. This makes reading O(the 275 * considers and consumes the updated tree. This makes reading O(the
267 * number of descendants which have been active since last read) instead of 276 * number of descendants which have been active since last read) instead of
@@ -271,20 +280,24 @@ struct css_set {
271 * aren't active and stat may be read frequently. The combination can 280 * aren't active and stat may be read frequently. The combination can
272 * become very expensive. By propagating selectively, increasing reading 281 * become very expensive. By propagating selectively, increasing reading
273 * frequency decreases the cost of each read. 282 * frequency decreases the cost of each read.
283 *
284 * This struct hosts both the fields which implement the above -
285 * updated_children and updated_next - and the fields which track basic
286 * resource statistics on top of it - bsync, bstat and last_bstat.
274 */ 287 */
275struct cgroup_cpu_stat { 288struct cgroup_rstat_cpu {
276 /* 289 /*
277 * ->sync protects all the current counters. These are the only 290 * ->bsync protects ->bstat. These are the only fields which get
278 * fields which get updated in the hot path. 291 * updated in the hot path.
279 */ 292 */
280 struct u64_stats_sync sync; 293 struct u64_stats_sync bsync;
281 struct task_cputime cputime; 294 struct cgroup_base_stat bstat;
282 295
283 /* 296 /*
284 * Snapshots at the last reading. These are used to calculate the 297 * Snapshots at the last reading. These are used to calculate the
285 * deltas to propagate to the global counters. 298 * deltas to propagate to the global counters.
286 */ 299 */
287 struct task_cputime last_cputime; 300 struct cgroup_base_stat last_bstat;
288 301
289 /* 302 /*
290 * Child cgroups with stat updates on this cpu since the last read 303 * Child cgroups with stat updates on this cpu since the last read
@@ -295,18 +308,12 @@ struct cgroup_cpu_stat {
295 * to the cgroup makes it unnecessary for each per-cpu struct to 308 * to the cgroup makes it unnecessary for each per-cpu struct to
296 * point back to the associated cgroup. 309 * point back to the associated cgroup.
297 * 310 *
298 * Protected by per-cpu cgroup_cpu_stat_lock. 311 * Protected by per-cpu cgroup_rstat_cpu_lock.
299 */ 312 */
300 struct cgroup *updated_children; /* terminated by self cgroup */ 313 struct cgroup *updated_children; /* terminated by self cgroup */
301 struct cgroup *updated_next; /* NULL iff not on the list */ 314 struct cgroup *updated_next; /* NULL iff not on the list */
302}; 315};
303 316
304struct cgroup_stat {
305 /* per-cpu statistics are collected into the folowing global counters */
306 struct task_cputime cputime;
307 struct prev_cputime prev_cputime;
308};
309
310struct cgroup { 317struct cgroup {
311 /* self css with NULL ->ss, points back to this cgroup */ 318 /* self css with NULL ->ss, points back to this cgroup */
312 struct cgroup_subsys_state self; 319 struct cgroup_subsys_state self;
@@ -406,10 +413,14 @@ struct cgroup {
406 */ 413 */
407 struct cgroup *dom_cgrp; 414 struct cgroup *dom_cgrp;
408 415
416 /* per-cpu recursive resource statistics */
417 struct cgroup_rstat_cpu __percpu *rstat_cpu;
418 struct list_head rstat_css_list;
419
409 /* cgroup basic resource statistics */ 420 /* cgroup basic resource statistics */
410 struct cgroup_cpu_stat __percpu *cpu_stat; 421 struct cgroup_base_stat pending_bstat; /* pending from children */
411 struct cgroup_stat pending_stat; /* pending from children */ 422 struct cgroup_base_stat bstat;
412 struct cgroup_stat stat; 423 struct prev_cputime prev_cputime; /* for printing out cputime */
413 424
414 /* 425 /*
415 * list of pidlists, up to two for each namespace (one for procs, one 426 * list of pidlists, up to two for each namespace (one for procs, one
@@ -570,6 +581,7 @@ struct cgroup_subsys {
570 void (*css_released)(struct cgroup_subsys_state *css); 581 void (*css_released)(struct cgroup_subsys_state *css);
571 void (*css_free)(struct cgroup_subsys_state *css); 582 void (*css_free)(struct cgroup_subsys_state *css);
572 void (*css_reset)(struct cgroup_subsys_state *css); 583 void (*css_reset)(struct cgroup_subsys_state *css);
584 void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
573 int (*css_extra_stat_show)(struct seq_file *seq, 585 int (*css_extra_stat_show)(struct seq_file *seq,
574 struct cgroup_subsys_state *css); 586 struct cgroup_subsys_state *css);
575 587
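The two pointers added above implement the per-cpu updated tree: updated_children is self-terminated (an empty list points back at the owning cgroup) and updated_next is NULL exactly when the cgroup is not linked on its parent's list, which is what makes the cheap already-on-list test possible. A minimal standalone sketch of that linking convention, using hypothetical node/parent names rather than the kernel types, with all locking and memory barriers omitted:

    #include <stdbool.h>
    #include <stddef.h>

    struct node {
    	struct node *parent;
    	struct node *updated_children;	/* empty list: points back at self */
    	struct node *updated_next;	/* NULL iff not on parent's list */
    };

    static void node_init(struct node *n, struct node *parent)
    {
    	n->parent = parent;
    	n->updated_children = n;	/* self-terminated, i.e. empty */
    	n->updated_next = NULL;
    }

    static bool node_on_updated_list(const struct node *n)
    {
    	return n->updated_next != NULL;
    }

    /* link @n and any not-yet-linked ancestors, bottom-up */
    static void node_mark_updated(struct node *n)
    {
    	struct node *p;

    	for (p = n->parent; p; n = p, p = n->parent) {
    		if (node_on_updated_list(n))
    			break;	/* ancestors are already linked too */
    		n->updated_next = p->updated_children;
    		p->updated_children = n;
    	}
    }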
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0abb86..c9fdf6f57913 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,11 +690,19 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
690 char *buf, size_t buflen) {} 690 char *buf, size_t buflen) {}
691#endif /* !CONFIG_CGROUPS */ 691#endif /* !CONFIG_CGROUPS */
692 692
693#ifdef CONFIG_CGROUPS
693/* 694/*
694 * Basic resource stats. 695 * cgroup scalable recursive statistics.
695 */ 696 */
696#ifdef CONFIG_CGROUPS 697void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
698void cgroup_rstat_flush(struct cgroup *cgrp);
699void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
700void cgroup_rstat_flush_hold(struct cgroup *cgrp);
701void cgroup_rstat_flush_release(void);
697 702
703/*
704 * Basic resource stats.
705 */
698#ifdef CONFIG_CGROUP_CPUACCT 706#ifdef CONFIG_CGROUP_CPUACCT
699void cpuacct_charge(struct task_struct *tsk, u64 cputime); 707void cpuacct_charge(struct task_struct *tsk, u64 cputime);
700void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); 708void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
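The declarations above are the whole consumer-facing surface of rstat. A hedged sketch of how a controller could sit on top of it, assuming a hypothetical example_css with a per-cpu event counter: the hot path bumps the counter and reports the cgroup as updated, and the ->css_rstat_flush() callback (added to cgroup_subsys by this patch) folds the per-cpu delta into a global total when a flush walks the updated tree. This keeps a per-css total only; propagating it to ancestors would be the controller's job, much as cgroup_base_stat_flush() does with pending_bstat.

    #include <linux/cgroup.h>
    #include <linux/percpu.h>

    /* hypothetical controller state, not part of this patch */
    struct example_css {
    	struct cgroup_subsys_state css;
    	u64 __percpu *events;		/* updated in the hot path */
    	u64 __percpu *events_last;	/* snapshot at last flush */
    	u64 events_total;		/* flushed global total */
    };

    /* hot path: cheap per-cpu update plus "this cgroup changed on this cpu" */
    static void example_account_event(struct example_css *ecss)
    {
    	int cpu = get_cpu();

    	(*per_cpu_ptr(ecss->events, cpu))++;
    	cgroup_rstat_updated(ecss->css.cgroup, cpu);
    	put_cpu();
    }

    /* flush callback: fold this cpu's delta into the global counter */
    static void example_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
    {
    	struct example_css *ecss = container_of(css, struct example_css, css);
    	u64 cur = *per_cpu_ptr(ecss->events, cpu);
    	u64 *last = per_cpu_ptr(ecss->events_last, cpu);

    	ecss->events_total += cur - *last;
    	*last = cur;
    }

The callback would be wired up as .css_rstat_flush in the subsystem's cgroup_subsys definition, alongside the existing css_alloc/css_free hooks.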
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 2be89a003185..bfcdae896122 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2obj-y := cgroup.o stat.o namespace.o cgroup-v1.o 2obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
3 3
4obj-$(CONFIG_CGROUP_FREEZER) += freezer.o 4obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
5obj-$(CONFIG_CGROUP_PIDS) += pids.o 5obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 0808a33d16d3..77ff1cd6a252 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
201int cgroup_task_count(const struct cgroup *cgrp); 201int cgroup_task_count(const struct cgroup *cgrp);
202 202
203/* 203/*
204 * stat.c 204 * rstat.c
205 */ 205 */
206void cgroup_stat_flush(struct cgroup *cgrp); 206int cgroup_rstat_init(struct cgroup *cgrp);
207int cgroup_stat_init(struct cgroup *cgrp); 207void cgroup_rstat_exit(struct cgroup *cgrp);
208void cgroup_stat_exit(struct cgroup *cgrp); 208void cgroup_rstat_boot(void);
209void cgroup_stat_show_cputime(struct seq_file *seq); 209void cgroup_base_stat_cputime_show(struct seq_file *seq);
210void cgroup_stat_boot(void);
211 210
212/* 211/*
213 * namespace.c 212 * namespace.c
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 12883656e63e..acb66713f9b6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
54#include <linux/proc_ns.h> 54#include <linux/proc_ns.h>
55#include <linux/nsproxy.h> 55#include <linux/nsproxy.h>
56#include <linux/file.h> 56#include <linux/file.h>
57#include <linux/sched/cputime.h>
57#include <net/sock.h> 58#include <net/sock.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
@@ -61,6 +62,8 @@
61 62
62#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ 63#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
63 MAX_CFTYPE_NAME + 2) 64 MAX_CFTYPE_NAME + 2)
65/* let's not notify more than 100 times per second */
66#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
64 67
65/* 68/*
66 * cgroup_mutex is the master lock. Any modification to cgroup or its 69 * cgroup_mutex is the master lock. Any modification to cgroup or its
@@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
142}; 145};
143#undef SUBSYS 146#undef SUBSYS
144 147
145static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); 148static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
146 149
147/* 150/*
148 * The default hierarchy, reserved for the subsystems that are otherwise 151 * The default hierarchy, reserved for the subsystems that are otherwise
149 * unattached - it never has more than a single cgroup, and all tasks are 152 * unattached - it never has more than a single cgroup, and all tasks are
150 * part of that cgroup. 153 * part of that cgroup.
151 */ 154 */
152struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; 155struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
153EXPORT_SYMBOL_GPL(cgrp_dfl_root); 156EXPORT_SYMBOL_GPL(cgrp_dfl_root);
154 157
155/* 158/*
@@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1554 spin_lock_irq(&cgroup_file_kn_lock); 1557 spin_lock_irq(&cgroup_file_kn_lock);
1555 cfile->kn = NULL; 1558 cfile->kn = NULL;
1556 spin_unlock_irq(&cgroup_file_kn_lock); 1559 spin_unlock_irq(&cgroup_file_kn_lock);
1560
1561 del_timer_sync(&cfile->notify_timer);
1557 } 1562 }
1558 1563
1559 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1564 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
1573 1578
1574 css->flags &= ~CSS_VISIBLE; 1579 css->flags &= ~CSS_VISIBLE;
1575 1580
1576 list_for_each_entry(cfts, &css->ss->cfts, node) 1581 if (!css->ss) {
1582 if (cgroup_on_dfl(cgrp))
1583 cfts = cgroup_base_files;
1584 else
1585 cfts = cgroup1_base_files;
1586
1577 cgroup_addrm_files(css, cgrp, cfts, false); 1587 cgroup_addrm_files(css, cgrp, cfts, false);
1588 } else {
1589 list_for_each_entry(cfts, &css->ss->cfts, node)
1590 cgroup_addrm_files(css, cgrp, cfts, false);
1591 }
1578} 1592}
1579 1593
1580/** 1594/**
@@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
1598 else 1612 else
1599 cfts = cgroup1_base_files; 1613 cfts = cgroup1_base_files;
1600 1614
1601 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); 1615 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1602 } 1616 if (ret < 0)
1603 1617 return ret;
1604 list_for_each_entry(cfts, &css->ss->cfts, node) { 1618 } else {
1605 ret = cgroup_addrm_files(css, cgrp, cfts, true); 1619 list_for_each_entry(cfts, &css->ss->cfts, node) {
1606 if (ret < 0) { 1620 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1607 failed_cfts = cfts; 1621 if (ret < 0) {
1608 goto err; 1622 failed_cfts = cfts;
1623 goto err;
1624 }
1609 } 1625 }
1610 } 1626 }
1611 1627
@@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void)
1782{ 1798{
1783 struct task_struct *p, *g; 1799 struct task_struct *p, *g;
1784 1800
1785 spin_lock_irq(&css_set_lock);
1786
1787 if (use_task_css_set_links)
1788 goto out_unlock;
1789
1790 use_task_css_set_links = true;
1791
1792 /* 1801 /*
1793 * We need tasklist_lock because RCU is not safe against 1802 * We need tasklist_lock because RCU is not safe against
1794 * while_each_thread(). Besides, a forking task that has passed 1803 * while_each_thread(). Besides, a forking task that has passed
@@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void)
1797 * tasklist if we walk through it with RCU. 1806 * tasklist if we walk through it with RCU.
1798 */ 1807 */
1799 read_lock(&tasklist_lock); 1808 read_lock(&tasklist_lock);
1809 spin_lock_irq(&css_set_lock);
1810
1811 if (use_task_css_set_links)
1812 goto out_unlock;
1813
1814 use_task_css_set_links = true;
1815
1800 do_each_thread(g, p) { 1816 do_each_thread(g, p) {
1801 WARN_ON_ONCE(!list_empty(&p->cg_list) || 1817 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1802 task_css_set(p) != &init_css_set); 1818 task_css_set(p) != &init_css_set);
@@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void)
1824 } 1840 }
1825 spin_unlock(&p->sighand->siglock); 1841 spin_unlock(&p->sighand->siglock);
1826 } while_each_thread(g, p); 1842 } while_each_thread(g, p);
1827 read_unlock(&tasklist_lock);
1828out_unlock: 1843out_unlock:
1829 spin_unlock_irq(&css_set_lock); 1844 spin_unlock_irq(&css_set_lock);
1845 read_unlock(&tasklist_lock);
1830} 1846}
1831 1847
1832static void init_cgroup_housekeeping(struct cgroup *cgrp) 1848static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1844 cgrp->dom_cgrp = cgrp; 1860 cgrp->dom_cgrp = cgrp;
1845 cgrp->max_descendants = INT_MAX; 1861 cgrp->max_descendants = INT_MAX;
1846 cgrp->max_depth = INT_MAX; 1862 cgrp->max_depth = INT_MAX;
1863 INIT_LIST_HEAD(&cgrp->rstat_css_list);
1864 prev_cputime_init(&cgrp->prev_cputime);
1847 1865
1848 for_each_subsys(ss, ssid) 1866 for_each_subsys(ss, ssid)
1849 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1867 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
3381 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; 3399 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3382 int ret = 0; 3400 int ret = 0;
3383 3401
3384 cgroup_stat_show_cputime(seq); 3402 cgroup_base_stat_cputime_show(seq);
3385#ifdef CONFIG_CGROUP_SCHED 3403#ifdef CONFIG_CGROUP_SCHED
3386 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); 3404 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3387#endif 3405#endif
@@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3521 return kernfs_setattr(kn, &iattr); 3539 return kernfs_setattr(kn, &iattr);
3522} 3540}
3523 3541
3542static void cgroup_file_notify_timer(struct timer_list *timer)
3543{
3544 cgroup_file_notify(container_of(timer, struct cgroup_file,
3545 notify_timer));
3546}
3547
3524static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, 3548static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3525 struct cftype *cft) 3549 struct cftype *cft)
3526{ 3550{
@@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3547 if (cft->file_offset) { 3571 if (cft->file_offset) {
3548 struct cgroup_file *cfile = (void *)css + cft->file_offset; 3572 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3549 3573
3574 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3575
3550 spin_lock_irq(&cgroup_file_kn_lock); 3576 spin_lock_irq(&cgroup_file_kn_lock);
3551 cfile->kn = kn; 3577 cfile->kn = kn;
3552 spin_unlock_irq(&cgroup_file_kn_lock); 3578 spin_unlock_irq(&cgroup_file_kn_lock);
@@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile)
3796 unsigned long flags; 3822 unsigned long flags;
3797 3823
3798 spin_lock_irqsave(&cgroup_file_kn_lock, flags); 3824 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3799 if (cfile->kn) 3825 if (cfile->kn) {
3800 kernfs_notify(cfile->kn); 3826 unsigned long last = cfile->notified_at;
3827 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
3828
3829 if (time_in_range(jiffies, last, next)) {
3830 timer_reduce(&cfile->notify_timer, next);
3831 } else {
3832 kernfs_notify(cfile->kn);
3833 cfile->notified_at = jiffies;
3834 }
3835 }
3801 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); 3836 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3802} 3837}
3803 3838
@@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work)
4560 cgroup_put(cgroup_parent(cgrp)); 4595 cgroup_put(cgroup_parent(cgrp));
4561 kernfs_put(cgrp->kn); 4596 kernfs_put(cgrp->kn);
4562 if (cgroup_on_dfl(cgrp)) 4597 if (cgroup_on_dfl(cgrp))
4563 cgroup_stat_exit(cgrp); 4598 cgroup_rstat_exit(cgrp);
4564 kfree(cgrp); 4599 kfree(cgrp);
4565 } else { 4600 } else {
4566 /* 4601 /*
@@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work)
4587 4622
4588 if (ss) { 4623 if (ss) {
4589 /* css release path */ 4624 /* css release path */
4625 if (!list_empty(&css->rstat_css_node)) {
4626 cgroup_rstat_flush(cgrp);
4627 list_del_rcu(&css->rstat_css_node);
4628 }
4629
4590 cgroup_idr_replace(&ss->css_idr, NULL, css->id); 4630 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4591 if (ss->css_released) 4631 if (ss->css_released)
4592 ss->css_released(css); 4632 ss->css_released(css);
@@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work)
4597 trace_cgroup_release(cgrp); 4637 trace_cgroup_release(cgrp);
4598 4638
4599 if (cgroup_on_dfl(cgrp)) 4639 if (cgroup_on_dfl(cgrp))
4600 cgroup_stat_flush(cgrp); 4640 cgroup_rstat_flush(cgrp);
4601 4641
4602 for (tcgrp = cgroup_parent(cgrp); tcgrp; 4642 for (tcgrp = cgroup_parent(cgrp); tcgrp;
4603 tcgrp = cgroup_parent(tcgrp)) 4643 tcgrp = cgroup_parent(tcgrp))
@@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4648 css->id = -1; 4688 css->id = -1;
4649 INIT_LIST_HEAD(&css->sibling); 4689 INIT_LIST_HEAD(&css->sibling);
4650 INIT_LIST_HEAD(&css->children); 4690 INIT_LIST_HEAD(&css->children);
4691 INIT_LIST_HEAD(&css->rstat_css_node);
4651 css->serial_nr = css_serial_nr_next++; 4692 css->serial_nr = css_serial_nr_next++;
4652 atomic_set(&css->online_cnt, 0); 4693 atomic_set(&css->online_cnt, 0);
4653 4694
@@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4656 css_get(css->parent); 4697 css_get(css->parent);
4657 } 4698 }
4658 4699
4700 if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
4701 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
4702
4659 BUG_ON(cgroup_css(cgrp, ss)); 4703 BUG_ON(cgroup_css(cgrp, ss));
4660} 4704}
4661 4705
@@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
4757err_list_del: 4801err_list_del:
4758 list_del_rcu(&css->sibling); 4802 list_del_rcu(&css->sibling);
4759err_free_css: 4803err_free_css:
4804 list_del_rcu(&css->rstat_css_node);
4760 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); 4805 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
4761 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); 4806 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
4762 return ERR_PTR(err); 4807 return ERR_PTR(err);
@@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4785 goto out_free_cgrp; 4830 goto out_free_cgrp;
4786 4831
4787 if (cgroup_on_dfl(parent)) { 4832 if (cgroup_on_dfl(parent)) {
4788 ret = cgroup_stat_init(cgrp); 4833 ret = cgroup_rstat_init(cgrp);
4789 if (ret) 4834 if (ret)
4790 goto out_cancel_ref; 4835 goto out_cancel_ref;
4791 } 4836 }
@@ -4850,7 +4895,7 @@ out_idr_free:
4850 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4895 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4851out_stat_exit: 4896out_stat_exit:
4852 if (cgroup_on_dfl(parent)) 4897 if (cgroup_on_dfl(parent))
4853 cgroup_stat_exit(cgrp); 4898 cgroup_rstat_exit(cgrp);
4854out_cancel_ref: 4899out_cancel_ref:
4855 percpu_ref_exit(&cgrp->self.refcnt); 4900 percpu_ref_exit(&cgrp->self.refcnt);
4856out_free_cgrp: 4901out_free_cgrp:
@@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5090 for_each_css(css, ssid, cgrp) 5135 for_each_css(css, ssid, cgrp)
5091 kill_css(css); 5136 kill_css(css);
5092 5137
5093 /* 5138 /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
5094 * Remove @cgrp directory along with the base files. @cgrp has an 5139 css_clear_dir(&cgrp->self);
5095 * extra ref on its kn.
5096 */
5097 kernfs_remove(cgrp->kn); 5140 kernfs_remove(cgrp->kn);
5098 5141
5099 if (parent && cgroup_is_threaded(cgrp)) 5142 if (parent && cgroup_is_threaded(cgrp))
@@ -5245,7 +5288,7 @@ int __init cgroup_init(void)
5245 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 5288 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5246 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); 5289 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5247 5290
5248 cgroup_stat_boot(); 5291 cgroup_rstat_boot();
5249 5292
5250 /* 5293 /*
5251 * The latency of the synchronize_sched() is too high for cgroups, 5294 * The latency of the synchronize_sched() is too high for cgroups,
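For context on the throttled notification path above, this is roughly how a controller ends up holding a struct cgroup_file in the first place and poking it: the cftype's file_offset points at an embedded cgroup_file, and every state change calls cgroup_file_notify(), which now coalesces bursts to at most one kernfs_notify() per CGROUP_FILE_NOTIFY_MIN_INTV. A hedged sketch with hypothetical "example" names; only cgroup_file_notify(), cftype.file_offset and seq_css() are existing interfaces.

    #include <linux/cgroup.h>
    #include <linux/atomic.h>
    #include <linux/seq_file.h>

    struct example_cgroup {
    	struct cgroup_subsys_state css;
    	struct cgroup_file events_file;	/* backs "example.events" */
    	atomic64_t nr_events;
    };

    static int example_events_show(struct seq_file *sf, void *v)
    {
    	struct example_cgroup *ex =
    		container_of(seq_css(sf), struct example_cgroup, css);

    	seq_printf(sf, "%llu\n",
    		   (unsigned long long)atomic64_read(&ex->nr_events));
    	return 0;
    }

    static void example_event_occurred(struct example_cgroup *ex)
    {
    	atomic64_inc(&ex->nr_events);
    	/* bursts within CGROUP_FILE_NOTIFY_MIN_INTV are coalesced now */
    	cgroup_file_notify(&ex->events_file);
    }

    static struct cftype example_files[] = {
    	{
    		.name = "events",
    		.file_offset = offsetof(struct example_cgroup, events_file),
    		.seq_show = example_events_show,
    	},
    	{ }	/* terminate */
    };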
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index defad3c5e7dc..d3bbb757ee49 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device);
362static int parse_resource(char *c, int *intval) 362static int parse_resource(char *c, int *intval)
363{ 363{
364 substring_t argstr; 364 substring_t argstr;
365 const char **table = &rdmacg_resource_names[0];
366 char *name, *value = c; 365 char *name, *value = c;
367 size_t len; 366 size_t len;
368 int ret, i = 0; 367 int ret, i;
369 368
370 name = strsep(&value, "="); 369 name = strsep(&value, "=");
371 if (!name || !value) 370 if (!name || !value)
372 return -EINVAL; 371 return -EINVAL;
373 372
374 len = strlen(value); 373 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
374 if (i < 0)
375 return i;
375 376
376 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 377 len = strlen(value);
377 if (strcmp(table[i], name))
378 continue;
379 378
380 argstr.from = value; 379 argstr.from = value;
381 argstr.to = value + len; 380 argstr.to = value + len;
382 381
383 ret = match_int(&argstr, intval); 382 ret = match_int(&argstr, intval);
384 if (ret >= 0) { 383 if (ret >= 0) {
385 if (*intval < 0) 384 if (*intval < 0)
386 break; 385 return -EINVAL;
387 return i; 386 return i;
388 } 387 }
389 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 388 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390 *intval = S32_MAX; 389 *intval = S32_MAX;
391 return i; 390 return i;
392 }
393 break;
394 } 391 }
395 return -EINVAL; 392 return -EINVAL;
396} 393}
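The rewrite above leans on match_string(), which returns the index of the first exact match within the given length or -EINVAL when nothing matches, so the early "if (i < 0) return i" covers the unknown-name case that the old open-coded loop handled with a break. A tiny illustrative sketch, with example table contents:

    #include <linux/string.h>
    #include <linux/kernel.h>

    static const char * const example_names[] = { "hca_handle", "hca_object" };

    /* returns 0 or 1 for a known name, -EINVAL otherwise */
    static int example_lookup(const char *name)
    {
    	return match_string(example_names, ARRAY_SIZE(example_names), name);
    }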
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..d503d1a9007c
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,416 @@
1#include "cgroup-internal.h"
2
3#include <linux/sched/cputime.h>
4
5static DEFINE_SPINLOCK(cgroup_rstat_lock);
6static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
7
8static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
9
10static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
11{
12 return per_cpu_ptr(cgrp->rstat_cpu, cpu);
13}
14
15/**
16 * cgroup_rstat_updated - keep track of updated rstat_cpu
17 * @cgrp: target cgroup
18 * @cpu: cpu on which rstat_cpu was updated
19 *
20 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
21 * rstat_cpu->updated_children list. See the comment on top of
22 * cgroup_rstat_cpu definition for details.
23 */
24void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
25{
26 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
27 struct cgroup *parent;
28 unsigned long flags;
29
30 /* nothing to do for root */
31 if (!cgroup_parent(cgrp))
32 return;
33
34 /*
 35	 * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
36 * see NULL updated_next or they see our updated stat.
37 */
38 smp_mb();
39
40 /*
41 * Because @parent's updated_children is terminated with @parent
42 * instead of NULL, we can tell whether @cgrp is on the list by
43 * testing the next pointer for NULL.
44 */
45 if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
46 return;
47
48 raw_spin_lock_irqsave(cpu_lock, flags);
49
50 /* put @cgrp and all ancestors on the corresponding updated lists */
51 for (parent = cgroup_parent(cgrp); parent;
52 cgrp = parent, parent = cgroup_parent(cgrp)) {
53 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
54 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
55
56 /*
57 * Both additions and removals are bottom-up. If a cgroup
58 * is already in the tree, all ancestors are.
59 */
60 if (rstatc->updated_next)
61 break;
62
63 rstatc->updated_next = prstatc->updated_children;
64 prstatc->updated_children = cgrp;
65 }
66
67 raw_spin_unlock_irqrestore(cpu_lock, flags);
68}
69EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
70
71/**
72 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
73 * @pos: current position
74 * @root: root of the tree to traversal
75 * @cpu: target cpu
76 *
 77 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
78 * the traversal and %NULL return indicates the end. During traversal,
79 * each returned cgroup is unlinked from the tree. Must be called with the
80 * matching cgroup_rstat_cpu_lock held.
81 *
82 * The only ordering guarantee is that, for a parent and a child pair
83 * covered by a given traversal, if a child is visited, its parent is
84 * guaranteed to be visited afterwards.
85 */
86static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
87 struct cgroup *root, int cpu)
88{
89 struct cgroup_rstat_cpu *rstatc;
90 struct cgroup *parent;
91
92 if (pos == root)
93 return NULL;
94
95 /*
96 * We're gonna walk down to the first leaf and visit/remove it. We
 97	 * can pick any unvisited node as the starting point.
98 */
99 if (!pos)
100 pos = root;
101 else
102 pos = cgroup_parent(pos);
103
104 /* walk down to the first leaf */
105 while (true) {
106 rstatc = cgroup_rstat_cpu(pos, cpu);
107 if (rstatc->updated_children == pos)
108 break;
109 pos = rstatc->updated_children;
110 }
111
112 /*
113 * Unlink @pos from the tree. As the updated_children list is
114 * singly linked, we have to walk it to find the removal point.
115 * However, due to the way we traverse, @pos will be the first
116 * child in most cases. The only exception is @root.
117 */
118 parent = cgroup_parent(pos);
119 if (parent && rstatc->updated_next) {
120 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
121 struct cgroup_rstat_cpu *nrstatc;
122 struct cgroup **nextp;
123
124 nextp = &prstatc->updated_children;
125 while (true) {
126 nrstatc = cgroup_rstat_cpu(*nextp, cpu);
127 if (*nextp == pos)
128 break;
129
130 WARN_ON_ONCE(*nextp == parent);
131 nextp = &nrstatc->updated_next;
132 }
133
134 *nextp = rstatc->updated_next;
135 rstatc->updated_next = NULL;
136
137 /*
 138		 * Paired with the one in cgroup_rstat_updated().
139 * Either they see NULL updated_next or we see their
140 * updated stat.
141 */
142 smp_mb();
143 }
144
145 return pos;
146}
147
148/* see cgroup_rstat_flush() */
149static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
150 __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
151{
152 int cpu;
153
154 lockdep_assert_held(&cgroup_rstat_lock);
155
156 for_each_possible_cpu(cpu) {
157 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
158 cpu);
159 struct cgroup *pos = NULL;
160
161 raw_spin_lock(cpu_lock);
162 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
163 struct cgroup_subsys_state *css;
164
165 cgroup_base_stat_flush(pos, cpu);
166
167 rcu_read_lock();
168 list_for_each_entry_rcu(css, &pos->rstat_css_list,
169 rstat_css_node)
170 css->ss->css_rstat_flush(css, cpu);
171 rcu_read_unlock();
172 }
173 raw_spin_unlock(cpu_lock);
174
175 /* if @may_sleep, play nice and yield if necessary */
176 if (may_sleep && (need_resched() ||
177 spin_needbreak(&cgroup_rstat_lock))) {
178 spin_unlock_irq(&cgroup_rstat_lock);
179 if (!cond_resched())
180 cpu_relax();
181 spin_lock_irq(&cgroup_rstat_lock);
182 }
183 }
184}
185
186/**
187 * cgroup_rstat_flush - flush stats in @cgrp's subtree
188 * @cgrp: target cgroup
189 *
190 * Collect all per-cpu stats in @cgrp's subtree into the global counters
191 * and propagate them upwards. After this function returns, all cgroups in
192 * the subtree have up-to-date ->stat.
193 *
194 * This also gets all cgroups in the subtree including @cgrp off the
195 * ->updated_children lists.
196 *
197 * This function may block.
198 */
199void cgroup_rstat_flush(struct cgroup *cgrp)
200{
201 might_sleep();
202
203 spin_lock_irq(&cgroup_rstat_lock);
204 cgroup_rstat_flush_locked(cgrp, true);
205 spin_unlock_irq(&cgroup_rstat_lock);
206}
207
208/**
209 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
210 * @cgrp: target cgroup
211 *
212 * This function can be called from any context.
213 */
214void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
215{
216 unsigned long flags;
217
218 spin_lock_irqsave(&cgroup_rstat_lock, flags);
219 cgroup_rstat_flush_locked(cgrp, false);
220 spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
221}
222
223/**
 224 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
225 * @cgrp: target cgroup
226 *
227 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
228 * paired with cgroup_rstat_flush_release().
229 *
230 * This function may block.
231 */
232void cgroup_rstat_flush_hold(struct cgroup *cgrp)
233 __acquires(&cgroup_rstat_lock)
234{
235 might_sleep();
236 spin_lock_irq(&cgroup_rstat_lock);
237 cgroup_rstat_flush_locked(cgrp, true);
238}
239
240/**
241 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
242 */
243void cgroup_rstat_flush_release(void)
244 __releases(&cgroup_rstat_lock)
245{
246 spin_unlock_irq(&cgroup_rstat_lock);
247}
248
249int cgroup_rstat_init(struct cgroup *cgrp)
250{
251 int cpu;
252
253 /* the root cgrp has rstat_cpu preallocated */
254 if (!cgrp->rstat_cpu) {
255 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
256 if (!cgrp->rstat_cpu)
257 return -ENOMEM;
258 }
259
260 /* ->updated_children list is self terminated */
261 for_each_possible_cpu(cpu) {
262 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
263
264 rstatc->updated_children = cgrp;
265 u64_stats_init(&rstatc->bsync);
266 }
267
268 return 0;
269}
270
271void cgroup_rstat_exit(struct cgroup *cgrp)
272{
273 int cpu;
274
275 cgroup_rstat_flush(cgrp);
276
277 /* sanity check */
278 for_each_possible_cpu(cpu) {
279 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
280
281 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
282 WARN_ON_ONCE(rstatc->updated_next))
283 return;
284 }
285
286 free_percpu(cgrp->rstat_cpu);
287 cgrp->rstat_cpu = NULL;
288}
289
290void __init cgroup_rstat_boot(void)
291{
292 int cpu;
293
294 for_each_possible_cpu(cpu)
295 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
296
297 BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
298}
299
300/*
301 * Functions for cgroup basic resource statistics implemented on top of
302 * rstat.
303 */
304static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
305 struct cgroup_base_stat *src_bstat)
306{
307 dst_bstat->cputime.utime += src_bstat->cputime.utime;
308 dst_bstat->cputime.stime += src_bstat->cputime.stime;
309 dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
310}
311
312static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
313{
314 struct cgroup *parent = cgroup_parent(cgrp);
315 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
316 struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
317 struct task_cputime cputime;
318 struct cgroup_base_stat delta;
319 unsigned seq;
320
321 /* fetch the current per-cpu values */
322 do {
323 seq = __u64_stats_fetch_begin(&rstatc->bsync);
324 cputime = rstatc->bstat.cputime;
325 } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
326
 327	/* calculate the delta to propagate */
328 delta.cputime.utime = cputime.utime - last_cputime->utime;
329 delta.cputime.stime = cputime.stime - last_cputime->stime;
330 delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
331 last_cputime->sum_exec_runtime;
332 *last_cputime = cputime;
333
334 /* transfer the pending stat into delta */
335 cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
336 memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));
337
338 /* propagate delta into the global stat and the parent's pending */
339 cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
340 if (parent)
341 cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
342}
343
344static struct cgroup_rstat_cpu *
345cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
346{
347 struct cgroup_rstat_cpu *rstatc;
348
349 rstatc = get_cpu_ptr(cgrp->rstat_cpu);
350 u64_stats_update_begin(&rstatc->bsync);
351 return rstatc;
352}
353
354static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
355 struct cgroup_rstat_cpu *rstatc)
356{
357 u64_stats_update_end(&rstatc->bsync);
358 cgroup_rstat_updated(cgrp, smp_processor_id());
359 put_cpu_ptr(rstatc);
360}
361
362void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
363{
364 struct cgroup_rstat_cpu *rstatc;
365
366 rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
367 rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
368 cgroup_base_stat_cputime_account_end(cgrp, rstatc);
369}
370
371void __cgroup_account_cputime_field(struct cgroup *cgrp,
372 enum cpu_usage_stat index, u64 delta_exec)
373{
374 struct cgroup_rstat_cpu *rstatc;
375
376 rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
377
378 switch (index) {
379 case CPUTIME_USER:
380 case CPUTIME_NICE:
381 rstatc->bstat.cputime.utime += delta_exec;
382 break;
383 case CPUTIME_SYSTEM:
384 case CPUTIME_IRQ:
385 case CPUTIME_SOFTIRQ:
386 rstatc->bstat.cputime.stime += delta_exec;
387 break;
388 default:
389 break;
390 }
391
392 cgroup_base_stat_cputime_account_end(cgrp, rstatc);
393}
394
395void cgroup_base_stat_cputime_show(struct seq_file *seq)
396{
397 struct cgroup *cgrp = seq_css(seq)->cgroup;
398 u64 usage, utime, stime;
399
400 if (!cgroup_parent(cgrp))
401 return;
402
403 cgroup_rstat_flush_hold(cgrp);
404 usage = cgrp->bstat.cputime.sum_exec_runtime;
405 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
406 cgroup_rstat_flush_release();
407
408 do_div(usage, NSEC_PER_USEC);
409 do_div(utime, NSEC_PER_USEC);
410 do_div(stime, NSEC_PER_USEC);
411
412 seq_printf(seq, "usage_usec %llu\n"
413 "user_usec %llu\n"
414 "system_usec %llu\n",
415 usage, utime, stime);
416}
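Tying the pieces together, a reader of the hypothetical per-css counter from the earlier sketch would follow the same pattern as cgroup_base_stat_cputime_show() above: flush the subtree with the lock held so the totals cannot move while they are reported, then release. This path may sleep, so it is not for atomic context; that is what cgroup_rstat_flush_irqsafe() is for, minus the hold.

    /* sketch only: example_css/events_total come from the earlier sketch */
    static u64 example_read_events(struct example_css *ecss)
    {
    	u64 total;

    	cgroup_rstat_flush_hold(ecss->css.cgroup);	/* flushes and holds */
    	total = ecss->events_total;
    	cgroup_rstat_flush_release();

    	return total;
    }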
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
deleted file mode 100644
index 1e111dd455c4..000000000000
--- a/kernel/cgroup/stat.c
+++ /dev/null
@@ -1,338 +0,0 @@
1#include "cgroup-internal.h"
2
3#include <linux/sched/cputime.h>
4
5static DEFINE_MUTEX(cgroup_stat_mutex);
6static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
7
8static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
9{
10 return per_cpu_ptr(cgrp->cpu_stat, cpu);
11}
12
13/**
14 * cgroup_cpu_stat_updated - keep track of updated cpu_stat
15 * @cgrp: target cgroup
16 * @cpu: cpu on which cpu_stat was updated
17 *
18 * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
19 * cpu_stat->updated_children list. See the comment on top of
20 * cgroup_cpu_stat definition for details.
21 */
22static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
23{
24 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
25 struct cgroup *parent;
26 unsigned long flags;
27
28 /*
29 * Speculative already-on-list test. This may race leading to
30 * temporary inaccuracies, which is fine.
31 *
32 * Because @parent's updated_children is terminated with @parent
33 * instead of NULL, we can tell whether @cgrp is on the list by
34 * testing the next pointer for NULL.
35 */
36 if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
37 return;
38
39 raw_spin_lock_irqsave(cpu_lock, flags);
40
41 /* put @cgrp and all ancestors on the corresponding updated lists */
42 for (parent = cgroup_parent(cgrp); parent;
43 cgrp = parent, parent = cgroup_parent(cgrp)) {
44 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
45 struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
46
47 /*
48 * Both additions and removals are bottom-up. If a cgroup
49 * is already in the tree, all ancestors are.
50 */
51 if (cstat->updated_next)
52 break;
53
54 cstat->updated_next = pcstat->updated_children;
55 pcstat->updated_children = cgrp;
56 }
57
58 raw_spin_unlock_irqrestore(cpu_lock, flags);
59}
60
61/**
62 * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
63 * @pos: current position
64 * @root: root of the tree to traversal
65 * @cpu: target cpu
66 *
67 * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
68 * the traversal and %NULL return indicates the end. During traversal,
69 * each returned cgroup is unlinked from the tree. Must be called with the
70 * matching cgroup_cpu_stat_lock held.
71 *
72 * The only ordering guarantee is that, for a parent and a child pair
73 * covered by a given traversal, if a child is visited, its parent is
74 * guaranteed to be visited afterwards.
75 */
76static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
77 struct cgroup *root, int cpu)
78{
79 struct cgroup_cpu_stat *cstat;
80 struct cgroup *parent;
81
82 if (pos == root)
83 return NULL;
84
85 /*
86 * We're gonna walk down to the first leaf and visit/remove it. We
87 * can pick whatever unvisited node as the starting point.
88 */
89 if (!pos)
90 pos = root;
91 else
92 pos = cgroup_parent(pos);
93
94 /* walk down to the first leaf */
95 while (true) {
96 cstat = cgroup_cpu_stat(pos, cpu);
97 if (cstat->updated_children == pos)
98 break;
99 pos = cstat->updated_children;
100 }
101
102 /*
103 * Unlink @pos from the tree. As the updated_children list is
104 * singly linked, we have to walk it to find the removal point.
105 * However, due to the way we traverse, @pos will be the first
106 * child in most cases. The only exception is @root.
107 */
108 parent = cgroup_parent(pos);
109 if (parent && cstat->updated_next) {
110 struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
111 struct cgroup_cpu_stat *ncstat;
112 struct cgroup **nextp;
113
114 nextp = &pcstat->updated_children;
115 while (true) {
116 ncstat = cgroup_cpu_stat(*nextp, cpu);
117 if (*nextp == pos)
118 break;
119
120 WARN_ON_ONCE(*nextp == parent);
121 nextp = &ncstat->updated_next;
122 }
123
124 *nextp = cstat->updated_next;
125 cstat->updated_next = NULL;
126 }
127
128 return pos;
129}
130
131static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
132 struct cgroup_stat *src_stat)
133{
134 dst_stat->cputime.utime += src_stat->cputime.utime;
135 dst_stat->cputime.stime += src_stat->cputime.stime;
136 dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
137}
138
139static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
140{
141 struct cgroup *parent = cgroup_parent(cgrp);
142 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
143 struct task_cputime *last_cputime = &cstat->last_cputime;
144 struct task_cputime cputime;
145 struct cgroup_stat delta;
146 unsigned seq;
147
148 lockdep_assert_held(&cgroup_stat_mutex);
149
150 /* fetch the current per-cpu values */
151 do {
152 seq = __u64_stats_fetch_begin(&cstat->sync);
153 cputime = cstat->cputime;
154 } while (__u64_stats_fetch_retry(&cstat->sync, seq));
155
156 /* accumulate the deltas to propgate */
157 delta.cputime.utime = cputime.utime - last_cputime->utime;
158 delta.cputime.stime = cputime.stime - last_cputime->stime;
159 delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
160 last_cputime->sum_exec_runtime;
161 *last_cputime = cputime;
162
163 /* transfer the pending stat into delta */
164 cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
165 memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
166
167 /* propagate delta into the global stat and the parent's pending */
168 cgroup_stat_accumulate(&cgrp->stat, &delta);
169 if (parent)
170 cgroup_stat_accumulate(&parent->pending_stat, &delta);
171}
172
173/* see cgroup_stat_flush() */
174static void cgroup_stat_flush_locked(struct cgroup *cgrp)
175{
176 int cpu;
177
178 lockdep_assert_held(&cgroup_stat_mutex);
179
180 for_each_possible_cpu(cpu) {
181 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
182 struct cgroup *pos = NULL;
183
184 raw_spin_lock_irq(cpu_lock);
185 while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
186 cgroup_cpu_stat_flush_one(pos, cpu);
187 raw_spin_unlock_irq(cpu_lock);
188 }
189}
190
191/**
192 * cgroup_stat_flush - flush stats in @cgrp's subtree
193 * @cgrp: target cgroup
194 *
195 * Collect all per-cpu stats in @cgrp's subtree into the global counters
196 * and propagate them upwards. After this function returns, all cgroups in
197 * the subtree have up-to-date ->stat.
198 *
199 * This also gets all cgroups in the subtree including @cgrp off the
200 * ->updated_children lists.
201 */
202void cgroup_stat_flush(struct cgroup *cgrp)
203{
204 mutex_lock(&cgroup_stat_mutex);
205 cgroup_stat_flush_locked(cgrp);
206 mutex_unlock(&cgroup_stat_mutex);
207}
208
209static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
210{
211 struct cgroup_cpu_stat *cstat;
212
213 cstat = get_cpu_ptr(cgrp->cpu_stat);
214 u64_stats_update_begin(&cstat->sync);
215 return cstat;
216}
217
218static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
219 struct cgroup_cpu_stat *cstat)
220{
221 u64_stats_update_end(&cstat->sync);
222 cgroup_cpu_stat_updated(cgrp, smp_processor_id());
223 put_cpu_ptr(cstat);
224}
225
226void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
227{
228 struct cgroup_cpu_stat *cstat;
229
230 cstat = cgroup_cpu_stat_account_begin(cgrp);
231 cstat->cputime.sum_exec_runtime += delta_exec;
232 cgroup_cpu_stat_account_end(cgrp, cstat);
233}
234
235void __cgroup_account_cputime_field(struct cgroup *cgrp,
236 enum cpu_usage_stat index, u64 delta_exec)
237{
238 struct cgroup_cpu_stat *cstat;
239
240 cstat = cgroup_cpu_stat_account_begin(cgrp);
241
242 switch (index) {
243 case CPUTIME_USER:
244 case CPUTIME_NICE:
245 cstat->cputime.utime += delta_exec;
246 break;
247 case CPUTIME_SYSTEM:
248 case CPUTIME_IRQ:
249 case CPUTIME_SOFTIRQ:
250 cstat->cputime.stime += delta_exec;
251 break;
252 default:
253 break;
254 }
255
256 cgroup_cpu_stat_account_end(cgrp, cstat);
257}
258
259void cgroup_stat_show_cputime(struct seq_file *seq)
260{
261 struct cgroup *cgrp = seq_css(seq)->cgroup;
262 u64 usage, utime, stime;
263
264 if (!cgroup_parent(cgrp))
265 return;
266
267 mutex_lock(&cgroup_stat_mutex);
268
269 cgroup_stat_flush_locked(cgrp);
270
271 usage = cgrp->stat.cputime.sum_exec_runtime;
272 cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
273 &utime, &stime);
274
275 mutex_unlock(&cgroup_stat_mutex);
276
277 do_div(usage, NSEC_PER_USEC);
278 do_div(utime, NSEC_PER_USEC);
279 do_div(stime, NSEC_PER_USEC);
280
281 seq_printf(seq, "usage_usec %llu\n"
282 "user_usec %llu\n"
283 "system_usec %llu\n",
284 usage, utime, stime);
285}
286
287int cgroup_stat_init(struct cgroup *cgrp)
288{
289 int cpu;
290
291 /* the root cgrp has cpu_stat preallocated */
292 if (!cgrp->cpu_stat) {
293 cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
294 if (!cgrp->cpu_stat)
295 return -ENOMEM;
296 }
297
298 /* ->updated_children list is self terminated */
299 for_each_possible_cpu(cpu) {
300 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
301
302 cstat->updated_children = cgrp;
303 u64_stats_init(&cstat->sync);
304 }
305
306 prev_cputime_init(&cgrp->stat.prev_cputime);
307
308 return 0;
309}
310
311void cgroup_stat_exit(struct cgroup *cgrp)
312{
313 int cpu;
314
315 cgroup_stat_flush(cgrp);
316
317 /* sanity check */
318 for_each_possible_cpu(cpu) {
319 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
320
321 if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
322 WARN_ON_ONCE(cstat->updated_next))
323 return;
324 }
325
326 free_percpu(cgrp->cpu_stat);
327 cgrp->cpu_stat = NULL;
328}
329
330void __init cgroup_stat_boot(void)
331{
332 int cpu;
333
334 for_each_possible_cpu(cpu)
335 raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
336
337 BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
338}