-rw-r--r--  include/linux/cgroup-defs.h       52
-rw-r--r--  include/linux/cgroup.h            12
-rw-r--r--  kernel/cgroup/Makefile             2
-rw-r--r--  kernel/cgroup/cgroup-internal.h   11
-rw-r--r--  kernel/cgroup/cgroup.c           105
-rw-r--r--  kernel/cgroup/rdma.c              35
-rw-r--r--  kernel/cgroup/rstat.c            416
-rw-r--r--  kernel/cgroup/stat.c             338
8 files changed, 554 insertions, 417 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index dc5b70449dc6..c0e68f903011 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -105,6 +105,8 @@ enum {
105struct cgroup_file { 105struct cgroup_file {
106 /* do not access any fields from outside cgroup core */ 106 /* do not access any fields from outside cgroup core */
107 struct kernfs_node *kn; 107 struct kernfs_node *kn;
108 unsigned long notified_at;
109 struct timer_list notify_timer;
108}; 110};
109 111
110/* 112/*
@@ -128,6 +130,9 @@ struct cgroup_subsys_state {
128 struct list_head sibling; 130 struct list_head sibling;
129 struct list_head children; 131 struct list_head children;
130 132
133 /* flush target list anchored at cgrp->rstat_css_list */
134 struct list_head rstat_css_node;
135
131 /* 136 /*
132 * PI: Subsys-unique ID. 0 is unused and root is always 1. The 137 * PI: Subsys-unique ID. 0 is unused and root is always 1. The
133 * matching css can be looked up using css_from_id(). 138 * matching css can be looked up using css_from_id().
@@ -256,12 +261,16 @@ struct css_set {
256 struct rcu_head rcu_head; 261 struct rcu_head rcu_head;
257}; 262};
258 263
264struct cgroup_base_stat {
265 struct task_cputime cputime;
266};
267
259/* 268/*
260 * cgroup basic resource usage statistics. Accounting is done per-cpu in 269 * rstat - cgroup scalable recursive statistics. Accounting is done
261 * cgroup_cpu_stat which is then lazily propagated up the hierarchy on 270 * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
262 * reads. 271 * hierarchy on reads.
263 * 272 *
264 * When a stat gets updated, the cgroup_cpu_stat and its ancestors are 273 * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
265 * linked into the updated tree. On the following read, propagation only 274 * linked into the updated tree. On the following read, propagation only
266 * considers and consumes the updated tree. This makes reading O(the 275 * considers and consumes the updated tree. This makes reading O(the
267 * number of descendants which have been active since last read) instead of 276 * number of descendants which have been active since last read) instead of
@@ -271,20 +280,24 @@ struct css_set {
271 * aren't active and stat may be read frequently. The combination can 280 * aren't active and stat may be read frequently. The combination can
272 * become very expensive. By propagating selectively, increasing reading 281 * become very expensive. By propagating selectively, increasing reading
273 * frequency decreases the cost of each read. 282 * frequency decreases the cost of each read.
283 *
284 * This struct hosts both the fields which implement the above -
285 * updated_children and updated_next - and the fields which track basic
286 * resource statistics on top of it - bsync, bstat and last_bstat.
274 */ 287 */
275struct cgroup_cpu_stat { 288struct cgroup_rstat_cpu {
276 /* 289 /*
277 * ->sync protects all the current counters. These are the only 290 * ->bsync protects ->bstat. These are the only fields which get
278 * fields which get updated in the hot path. 291 * updated in the hot path.
279 */ 292 */
280 struct u64_stats_sync sync; 293 struct u64_stats_sync bsync;
281 struct task_cputime cputime; 294 struct cgroup_base_stat bstat;
282 295
283 /* 296 /*
284 * Snapshots at the last reading. These are used to calculate the 297 * Snapshots at the last reading. These are used to calculate the
285 * deltas to propagate to the global counters. 298 * deltas to propagate to the global counters.
286 */ 299 */
287 struct task_cputime last_cputime; 300 struct cgroup_base_stat last_bstat;
288 301
289 /* 302 /*
290 * Child cgroups with stat updates on this cpu since the last read 303 * Child cgroups with stat updates on this cpu since the last read
@@ -295,18 +308,12 @@ struct cgroup_cpu_stat {
295 * to the cgroup makes it unnecessary for each per-cpu struct to 308 * to the cgroup makes it unnecessary for each per-cpu struct to
296 * point back to the associated cgroup. 309 * point back to the associated cgroup.
297 * 310 *
298 * Protected by per-cpu cgroup_cpu_stat_lock. 311 * Protected by per-cpu cgroup_rstat_cpu_lock.
299 */ 312 */
300 struct cgroup *updated_children; /* terminated by self cgroup */ 313 struct cgroup *updated_children; /* terminated by self cgroup */
301 struct cgroup *updated_next; /* NULL iff not on the list */ 314 struct cgroup *updated_next; /* NULL iff not on the list */
302}; 315};
303 316
304struct cgroup_stat {
305 /* per-cpu statistics are collected into the folowing global counters */
306 struct task_cputime cputime;
307 struct prev_cputime prev_cputime;
308};
309
310struct cgroup { 317struct cgroup {
311 /* self css with NULL ->ss, points back to this cgroup */ 318 /* self css with NULL ->ss, points back to this cgroup */
312 struct cgroup_subsys_state self; 319 struct cgroup_subsys_state self;
@@ -406,10 +413,14 @@ struct cgroup {
406 */ 413 */
407 struct cgroup *dom_cgrp; 414 struct cgroup *dom_cgrp;
408 415
416 /* per-cpu recursive resource statistics */
417 struct cgroup_rstat_cpu __percpu *rstat_cpu;
418 struct list_head rstat_css_list;
419
409 /* cgroup basic resource statistics */ 420 /* cgroup basic resource statistics */
410 struct cgroup_cpu_stat __percpu *cpu_stat; 421 struct cgroup_base_stat pending_bstat; /* pending from children */
411 struct cgroup_stat pending_stat; /* pending from children */ 422 struct cgroup_base_stat bstat;
412 struct cgroup_stat stat; 423 struct prev_cputime prev_cputime; /* for printing out cputime */
413 424
414 /* 425 /*
415 * list of pidlists, up to two for each namespace (one for procs, one 426 * list of pidlists, up to two for each namespace (one for procs, one
@@ -570,6 +581,7 @@ struct cgroup_subsys {
570 void (*css_released)(struct cgroup_subsys_state *css); 581 void (*css_released)(struct cgroup_subsys_state *css);
571 void (*css_free)(struct cgroup_subsys_state *css); 582 void (*css_free)(struct cgroup_subsys_state *css);
572 void (*css_reset)(struct cgroup_subsys_state *css); 583 void (*css_reset)(struct cgroup_subsys_state *css);
584 void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
573 int (*css_extra_stat_show)(struct seq_file *seq, 585 int (*css_extra_stat_show)(struct seq_file *seq,
574 struct cgroup_subsys_state *css); 586 struct cgroup_subsys_state *css);
575 587
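The two pointers added above implement the per-cpu updated tree: updated_children is self-terminated (an empty list points back at the owning cgroup) and updated_next is NULL exactly when the cgroup is not linked on its parent's list, which is what makes the cheap already-on-list test possible. A minimal standalone sketch of that linking convention, using hypothetical node/parent names rather than the kernel types, with all locking and memory barriers omitted:

    #include <stdbool.h>
    #include <stddef.h>

    struct node {
    	struct node *parent;
    	struct node *updated_children;	/* empty list: points back at self */
    	struct node *updated_next;	/* NULL iff not on parent's list */
    };

    static void node_init(struct node *n, struct node *parent)
    {
    	n->parent = parent;
    	n->updated_children = n;	/* self-terminated, i.e. empty */
    	n->updated_next = NULL;
    }

    static bool node_on_updated_list(const struct node *n)
    {
    	return n->updated_next != NULL;
    }

    /* link @n and any not-yet-linked ancestors, bottom-up */
    static void node_mark_updated(struct node *n)
    {
    	struct node *p;

    	for (p = n->parent; p; n = p, p = n->parent) {
    		if (node_on_updated_list(n))
    			break;	/* ancestors are already linked too */
    		n->updated_next = p->updated_children;
    		p->updated_children = n;
    	}
    }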
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0abb86..c9fdf6f57913 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,11 +690,19 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
690 char *buf, size_t buflen) {} 690 char *buf, size_t buflen) {}
691#endif /* !CONFIG_CGROUPS */ 691#endif /* !CONFIG_CGROUPS */
692 692
693#ifdef CONFIG_CGROUPS
693/* 694/*
694 * Basic resource stats. 695 * cgroup scalable recursive statistics.
695 */ 696 */
696#ifdef CONFIG_CGROUPS 697void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
698void cgroup_rstat_flush(struct cgroup *cgrp);
699void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
700void cgroup_rstat_flush_hold(struct cgroup *cgrp);
701void cgroup_rstat_flush_release(void);
697 702
703/*
704 * Basic resource stats.
705 */
698#ifdef CONFIG_CGROUP_CPUACCT 706#ifdef CONFIG_CGROUP_CPUACCT
699void cpuacct_charge(struct task_struct *tsk, u64 cputime); 707void cpuacct_charge(struct task_struct *tsk, u64 cputime);
700void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); 708void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
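The declarations above are the whole consumer-facing surface of rstat. A hedged sketch of how a controller could sit on top of it, assuming a hypothetical example_css with a per-cpu event counter: the hot path bumps the counter and reports the cgroup as updated, and the ->css_rstat_flush() callback (added to cgroup_subsys by this patch) folds the per-cpu delta into a global total when a flush walks the updated tree. This keeps a per-css total only; propagating it to ancestors would be the controller's job, much as cgroup_base_stat_flush() does with pending_bstat.

    #include <linux/cgroup.h>
    #include <linux/percpu.h>

    /* hypothetical controller state, not part of this patch */
    struct example_css {
    	struct cgroup_subsys_state css;
    	u64 __percpu *events;		/* updated in the hot path */
    	u64 __percpu *events_last;	/* snapshot at last flush */
    	u64 events_total;		/* flushed global total */
    };

    /* hot path: cheap per-cpu update plus "this cgroup changed on this cpu" */
    static void example_account_event(struct example_css *ecss)
    {
    	int cpu = get_cpu();

    	(*per_cpu_ptr(ecss->events, cpu))++;
    	cgroup_rstat_updated(ecss->css.cgroup, cpu);
    	put_cpu();
    }

    /* flush callback: fold this cpu's delta into the global counter */
    static void example_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
    {
    	struct example_css *ecss = container_of(css, struct example_css, css);
    	u64 cur = *per_cpu_ptr(ecss->events, cpu);
    	u64 *last = per_cpu_ptr(ecss->events_last, cpu);

    	ecss->events_total += cur - *last;
    	*last = cur;
    }

The callback would be wired up as .css_rstat_flush in the subsystem's cgroup_subsys definition, alongside the existing css_alloc/css_free hooks.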
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 2be89a003185..bfcdae896122 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2obj-y := cgroup.o stat.o namespace.o cgroup-v1.o 2obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
3 3
4obj-$(CONFIG_CGROUP_FREEZER) += freezer.o 4obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
5obj-$(CONFIG_CGROUP_PIDS) += pids.o 5obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 0808a33d16d3..77ff1cd6a252 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
201int cgroup_task_count(const struct cgroup *cgrp); 201int cgroup_task_count(const struct cgroup *cgrp);
202 202
203/* 203/*
204 * stat.c 204 * rstat.c
205 */ 205 */
206void cgroup_stat_flush(struct cgroup *cgrp); 206int cgroup_rstat_init(struct cgroup *cgrp);
207int cgroup_stat_init(struct cgroup *cgrp); 207void cgroup_rstat_exit(struct cgroup *cgrp);
208void cgroup_stat_exit(struct cgroup *cgrp); 208void cgroup_rstat_boot(void);
209void cgroup_stat_show_cputime(struct seq_file *seq); 209void cgroup_base_stat_cputime_show(struct seq_file *seq);
210void cgroup_stat_boot(void);
211 210
212/* 211/*
213 * namespace.c 212 * namespace.c
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 12883656e63e..acb66713f9b6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
54#include <linux/proc_ns.h> 54#include <linux/proc_ns.h>
55#include <linux/nsproxy.h> 55#include <linux/nsproxy.h>
56#include <linux/file.h> 56#include <linux/file.h>
57#include <linux/sched/cputime.h>
57#include <net/sock.h> 58#include <net/sock.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
@@ -61,6 +62,8 @@
61 62
62#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ 63#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
63 MAX_CFTYPE_NAME + 2) 64 MAX_CFTYPE_NAME + 2)
65/* let's not notify more than 100 times per second */
66#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
64 67
65/* 68/*
66 * cgroup_mutex is the master lock. Any modification to cgroup or its 69 * cgroup_mutex is the master lock. Any modification to cgroup or its
@@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
142}; 145};
143#undef SUBSYS 146#undef SUBSYS
144 147
145static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); 148static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
146 149
147/* 150/*
148 * The default hierarchy, reserved for the subsystems that are otherwise 151 * The default hierarchy, reserved for the subsystems that are otherwise
149 * unattached - it never has more than a single cgroup, and all tasks are 152 * unattached - it never has more than a single cgroup, and all tasks are
150 * part of that cgroup. 153 * part of that cgroup.
151 */ 154 */
152struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; 155struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
153EXPORT_SYMBOL_GPL(cgrp_dfl_root); 156EXPORT_SYMBOL_GPL(cgrp_dfl_root);
154 157
155/* 158/*
@@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1554 spin_lock_irq(&cgroup_file_kn_lock); 1557 spin_lock_irq(&cgroup_file_kn_lock);
1555 cfile->kn = NULL; 1558 cfile->kn = NULL;
1556 spin_unlock_irq(&cgroup_file_kn_lock); 1559 spin_unlock_irq(&cgroup_file_kn_lock);
1560
1561 del_timer_sync(&cfile->notify_timer);
1557 } 1562 }
1558 1563
1559 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1564 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
1573 1578
1574 css->flags &= ~CSS_VISIBLE; 1579 css->flags &= ~CSS_VISIBLE;
1575 1580
1576 list_for_each_entry(cfts, &css->ss->cfts, node) 1581 if (!css->ss) {
1582 if (cgroup_on_dfl(cgrp))
1583 cfts = cgroup_base_files;
1584 else
1585 cfts = cgroup1_base_files;
1586
1577 cgroup_addrm_files(css, cgrp, cfts, false); 1587 cgroup_addrm_files(css, cgrp, cfts, false);
1588 } else {
1589 list_for_each_entry(cfts, &css->ss->cfts, node)
1590 cgroup_addrm_files(css, cgrp, cfts, false);
1591 }
1578} 1592}
1579 1593
1580/** 1594/**
@@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
1598 else 1612 else
1599 cfts = cgroup1_base_files; 1613 cfts = cgroup1_base_files;
1600 1614
1601 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); 1615 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1602 } 1616 if (ret < 0)
1603 1617 return ret;
1604 list_for_each_entry(cfts, &css->ss->cfts, node) { 1618 } else {
1605 ret = cgroup_addrm_files(css, cgrp, cfts, true); 1619 list_for_each_entry(cfts, &css->ss->cfts, node) {
1606 if (ret < 0) { 1620 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1607 failed_cfts = cfts; 1621 if (ret < 0) {
1608 goto err; 1622 failed_cfts = cfts;
1623 goto err;
1624 }
1609 } 1625 }
1610 } 1626 }
1611 1627
@@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void)
1782{ 1798{
1783 struct task_struct *p, *g; 1799 struct task_struct *p, *g;
1784 1800
1785 spin_lock_irq(&css_set_lock);
1786
1787 if (use_task_css_set_links)
1788 goto out_unlock;
1789
1790 use_task_css_set_links = true;
1791
1792 /* 1801 /*
1793 * We need tasklist_lock because RCU is not safe against 1802 * We need tasklist_lock because RCU is not safe against
1794 * while_each_thread(). Besides, a forking task that has passed 1803 * while_each_thread(). Besides, a forking task that has passed
@@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void)
1797 * tasklist if we walk through it with RCU. 1806 * tasklist if we walk through it with RCU.
1798 */ 1807 */
1799 read_lock(&tasklist_lock); 1808 read_lock(&tasklist_lock);
1809 spin_lock_irq(&css_set_lock);
1810
1811 if (use_task_css_set_links)
1812 goto out_unlock;
1813
1814 use_task_css_set_links = true;
1815
1800 do_each_thread(g, p) { 1816 do_each_thread(g, p) {
1801 WARN_ON_ONCE(!list_empty(&p->cg_list) || 1817 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1802 task_css_set(p) != &init_css_set); 1818 task_css_set(p) != &init_css_set);
@@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void)
1824 } 1840 }
1825 spin_unlock(&p->sighand->siglock); 1841 spin_unlock(&p->sighand->siglock);
1826 } while_each_thread(g, p); 1842 } while_each_thread(g, p);
1827 read_unlock(&tasklist_lock);
1828out_unlock: 1843out_unlock:
1829 spin_unlock_irq(&css_set_lock); 1844 spin_unlock_irq(&css_set_lock);
1845 read_unlock(&tasklist_lock);
1830} 1846}
1831 1847
1832static void init_cgroup_housekeeping(struct cgroup *cgrp) 1848static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1844 cgrp->dom_cgrp = cgrp; 1860 cgrp->dom_cgrp = cgrp;
1845 cgrp->max_descendants = INT_MAX; 1861 cgrp->max_descendants = INT_MAX;
1846 cgrp->max_depth = INT_MAX; 1862 cgrp->max_depth = INT_MAX;
1863 INIT_LIST_HEAD(&cgrp->rstat_css_list);
1864 prev_cputime_init(&cgrp->prev_cputime);
1847 1865
1848 for_each_subsys(ss, ssid) 1866 for_each_subsys(ss, ssid)
1849 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1867 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
3381 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; 3399 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3382 int ret = 0; 3400 int ret = 0;
3383 3401
3384 cgroup_stat_show_cputime(seq); 3402 cgroup_base_stat_cputime_show(seq);
3385#ifdef CONFIG_CGROUP_SCHED 3403#ifdef CONFIG_CGROUP_SCHED
3386 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); 3404 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3387#endif 3405#endif
@@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3521 return kernfs_setattr(kn, &iattr); 3539 return kernfs_setattr(kn, &iattr);
3522} 3540}
3523 3541
3542static void cgroup_file_notify_timer(struct timer_list *timer)
3543{
3544 cgroup_file_notify(container_of(timer, struct cgroup_file,
3545 notify_timer));
3546}
3547
3524static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, 3548static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3525 struct cftype *cft) 3549 struct cftype *cft)
3526{ 3550{
@@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3547 if (cft->file_offset) { 3571 if (cft->file_offset) {
3548 struct cgroup_file *cfile = (void *)css + cft->file_offset; 3572 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3549 3573
3574 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3575
3550 spin_lock_irq(&cgroup_file_kn_lock); 3576 spin_lock_irq(&cgroup_file_kn_lock);
3551 cfile->kn = kn; 3577 cfile->kn = kn;
3552 spin_unlock_irq(&cgroup_file_kn_lock); 3578 spin_unlock_irq(&cgroup_file_kn_lock);
@@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile)
3796 unsigned long flags; 3822 unsigned long flags;
3797 3823
3798 spin_lock_irqsave(&cgroup_file_kn_lock, flags); 3824 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3799 if (cfile->kn) 3825 if (cfile->kn) {
3800 kernfs_notify(cfile->kn); 3826 unsigned long last = cfile->notified_at;
3827 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
3828
3829 if (time_in_range(jiffies, last, next)) {
3830 timer_reduce(&cfile->notify_timer, next);
3831 } else {
3832 kernfs_notify(cfile->kn);
3833 cfile->notified_at = jiffies;
3834 }
3835 }
3801 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); 3836 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3802} 3837}
3803 3838
@@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work)
4560 cgroup_put(cgroup_parent(cgrp)); 4595 cgroup_put(cgroup_parent(cgrp));
4561 kernfs_put(cgrp->kn); 4596 kernfs_put(cgrp->kn);
4562 if (cgroup_on_dfl(cgrp)) 4597 if (cgroup_on_dfl(cgrp))
4563 cgroup_stat_exit(cgrp); 4598 cgroup_rstat_exit(cgrp);
4564 kfree(cgrp); 4599 kfree(cgrp);
4565 } else { 4600 } else {
4566 /* 4601 /*
@@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work)
4587 4622
4588 if (ss) { 4623 if (ss) {
4589 /* css release path */ 4624 /* css release path */
4625 if (!list_empty(&css->rstat_css_node)) {
4626 cgroup_rstat_flush(cgrp);
4627 list_del_rcu(&css->rstat_css_node);
4628 }
4629
4590 cgroup_idr_replace(&ss->css_idr, NULL, css->id); 4630 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4591 if (ss->css_released) 4631 if (ss->css_released)
4592 ss->css_released(css); 4632 ss->css_released(css);
@@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work)
4597 trace_cgroup_release(cgrp); 4637 trace_cgroup_release(cgrp);
4598 4638
4599 if (cgroup_on_dfl(cgrp)) 4639 if (cgroup_on_dfl(cgrp))
4600 cgroup_stat_flush(cgrp); 4640 cgroup_rstat_flush(cgrp);
4601 4641
4602 for (tcgrp = cgroup_parent(cgrp); tcgrp; 4642 for (tcgrp = cgroup_parent(cgrp); tcgrp;
4603 tcgrp = cgroup_parent(tcgrp)) 4643 tcgrp = cgroup_parent(tcgrp))
@@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4648 css->id = -1; 4688 css->id = -1;
4649 INIT_LIST_HEAD(&css->sibling); 4689 INIT_LIST_HEAD(&css->sibling);
4650 INIT_LIST_HEAD(&css->children); 4690 INIT_LIST_HEAD(&css->children);
4691 INIT_LIST_HEAD(&css->rstat_css_node);
4651 css->serial_nr = css_serial_nr_next++; 4692 css->serial_nr = css_serial_nr_next++;
4652 atomic_set(&css->online_cnt, 0); 4693 atomic_set(&css->online_cnt, 0);
4653 4694
@@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4656 css_get(css->parent); 4697 css_get(css->parent);
4657 } 4698 }
4658 4699
4700 if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
4701 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
4702
4659 BUG_ON(cgroup_css(cgrp, ss)); 4703 BUG_ON(cgroup_css(cgrp, ss));
4660} 4704}
4661 4705
@@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
4757err_list_del: 4801err_list_del:
4758 list_del_rcu(&css->sibling); 4802 list_del_rcu(&css->sibling);
4759err_free_css: 4803err_free_css:
4804 list_del_rcu(&css->rstat_css_node);
4760 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); 4805 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
4761 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); 4806 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
4762 return ERR_PTR(err); 4807 return ERR_PTR(err);
@@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4785 goto out_free_cgrp; 4830 goto out_free_cgrp;
4786 4831
4787 if (cgroup_on_dfl(parent)) { 4832 if (cgroup_on_dfl(parent)) {
4788 ret = cgroup_stat_init(cgrp); 4833 ret = cgroup_rstat_init(cgrp);
4789 if (ret) 4834 if (ret)
4790 goto out_cancel_ref; 4835 goto out_cancel_ref;
4791 } 4836 }
@@ -4850,7 +4895,7 @@ out_idr_free:
4850 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4895 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4851out_stat_exit: 4896out_stat_exit:
4852 if (cgroup_on_dfl(parent)) 4897 if (cgroup_on_dfl(parent))
4853 cgroup_stat_exit(cgrp); 4898 cgroup_rstat_exit(cgrp);
4854out_cancel_ref: 4899out_cancel_ref:
4855 percpu_ref_exit(&cgrp->self.refcnt); 4900 percpu_ref_exit(&cgrp->self.refcnt);
4856out_free_cgrp: 4901out_free_cgrp:
@@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5090 for_each_css(css, ssid, cgrp) 5135 for_each_css(css, ssid, cgrp)
5091 kill_css(css); 5136 kill_css(css);
5092 5137
5093 /* 5138 /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
5094 * Remove @cgrp directory along with the base files. @cgrp has an 5139 css_clear_dir(&cgrp->self);
5095 * extra ref on its kn.
5096 */
5097 kernfs_remove(cgrp->kn); 5140 kernfs_remove(cgrp->kn);
5098 5141
5099 if (parent && cgroup_is_threaded(cgrp)) 5142 if (parent && cgroup_is_threaded(cgrp))
@@ -5245,7 +5288,7 @@ int __init cgroup_init(void)
5245 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 5288 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5246 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); 5289 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5247 5290
5248 cgroup_stat_boot(); 5291 cgroup_rstat_boot();
5249 5292
5250 /* 5293 /*
5251 * The latency of the synchronize_sched() is too high for cgroups, 5294 * The latency of the synchronize_sched() is too high for cgroups,
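For context on the throttled notification path above, this is roughly how a controller ends up holding a struct cgroup_file in the first place and poking it: the cftype's file_offset points at an embedded cgroup_file, and every state change calls cgroup_file_notify(), which now coalesces bursts to at most one kernfs_notify() per CGROUP_FILE_NOTIFY_MIN_INTV. A hedged sketch with hypothetical "example" names; only cgroup_file_notify(), cftype.file_offset and seq_css() are existing interfaces.

    #include <linux/cgroup.h>
    #include <linux/atomic.h>
    #include <linux/seq_file.h>

    struct example_cgroup {
    	struct cgroup_subsys_state css;
    	struct cgroup_file events_file;	/* backs "example.events" */
    	atomic64_t nr_events;
    };

    static int example_events_show(struct seq_file *sf, void *v)
    {
    	struct example_cgroup *ex =
    		container_of(seq_css(sf), struct example_cgroup, css);

    	seq_printf(sf, "%llu\n",
    		   (unsigned long long)atomic64_read(&ex->nr_events));
    	return 0;
    }

    static void example_event_occurred(struct example_cgroup *ex)
    {
    	atomic64_inc(&ex->nr_events);
    	/* bursts within CGROUP_FILE_NOTIFY_MIN_INTV are coalesced now */
    	cgroup_file_notify(&ex->events_file);
    }

    static struct cftype example_files[] = {
    	{
    		.name = "events",
    		.file_offset = offsetof(struct example_cgroup, events_file),
    		.seq_show = example_events_show,
    	},
    	{ }	/* terminate */
    };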
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index defad3c5e7dc..d3bbb757ee49 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device);
362static int parse_resource(char *c, int *intval) 362static int parse_resource(char *c, int *intval)
363{ 363{
364 substring_t argstr; 364 substring_t argstr;
365 const char **table = &rdmacg_resource_names[0];
366 char *name, *value = c; 365 char *name, *value = c;
367 size_t len; 366 size_t len;
368 int ret, i = 0; 367 int ret, i;
369 368
370 name = strsep(&value, "="); 369 name = strsep(&value, "=");
371 if (!name || !value) 370 if (!name || !value)
372 return -EINVAL; 371 return -EINVAL;
373 372
374 len = strlen(value); 373 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
374 if (i < 0)
375 return i;
375 376
376 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 377 len = strlen(value);
377 if (strcmp(table[i], name))
378 continue;
379 378
380 argstr.from = value; 379 argstr.from = value;
381 argstr.to = value + len; 380 argstr.to = value + len;
382 381
383 ret = match_int(&argstr, intval); 382 ret = match_int(&argstr, intval);
384 if (ret >= 0) { 383 if (ret >= 0) {
385 if (*intval < 0) 384 if (*intval < 0)
386 break; 385 return -EINVAL;
387 return i; 386 return i;
388 } 387 }
389 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 388 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390 *intval = S32_MAX; 389 *intval = S32_MAX;
391 return i; 390 return i;
392 }
393 break;
394 } 391 }
395 return -EINVAL; 392 return -EINVAL;
396} 393}
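The rewrite above leans on match_string(), which returns the index of the first exact match within the given length or -EINVAL when nothing matches, so the early "if (i < 0) return i" covers the unknown-name case that the old open-coded loop handled with a break. A tiny illustrative sketch, with example table contents:

    #include <linux/string.h>
    #include <linux/kernel.h>

    static const char * const example_names[] = { "hca_handle", "hca_object" };

    /* returns 0 or 1 for a known name, -EINVAL otherwise */
    static int example_lookup(const char *name)
    {
    	return match_string(example_names, ARRAY_SIZE(example_names), name);
    }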
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..d503d1a9007c
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,416 @@
1#include "cgroup-internal.h"
2
3#include <linux/sched/cputime.h>
4
5static DEFINE_SPINLOCK(cgroup_rstat_lock);
6static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
7
8static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
9
10static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
11{
12 return per_cpu_ptr(cgrp->rstat_cpu, cpu);
13}
14
15/**
16 * cgroup_rstat_updated - keep track of updated rstat_cpu
17 * @cgrp: target cgroup
18 * @cpu: cpu on which rstat_cpu was updated
19 *
20 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
21 * rstat_cpu->updated_children list. See the comment on top of
22 * cgroup_rstat_cpu definition for details.
23 */
24void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
25{
26 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
27 struct cgroup *parent;
28 unsigned long flags;
29
30 /* nothing to do for root */
31 if (!cgroup_parent(cgrp))
32 return;
33
34 /*
 35	 * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
36 * see NULL updated_next or they see our updated stat.
37 */
38 smp_mb();
39
40 /*
41 * Because @parent's updated_children is terminated with @parent
42 * instead of NULL, we can tell whether @cgrp is on the list by
43 * testing the next pointer for NULL.
44 */
45 if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
46 return;
47
48 raw_spin_lock_irqsave(cpu_lock, flags);
49
50 /* put @cgrp and all ancestors on the corresponding updated lists */
51 for (parent = cgroup_parent(cgrp); parent;
52 cgrp = parent, parent = cgroup_parent(cgrp)) {
53 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
54 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
55
56 /*
57 * Both additions and removals are bottom-up. If a cgroup
58 * is already in the tree, all ancestors are.
59 */
60 if (rstatc->updated_next)
61 break;
62
63 rstatc->updated_next = prstatc->updated_children;
64 prstatc->updated_children = cgrp;
65 }
66
67 raw_spin_unlock_irqrestore(cpu_lock, flags);
68}
69EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
70
71/**
72 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
73 * @pos: current position
74 * @root: root of the tree to traversal
75 * @cpu: target cpu
76 *
 77 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
78 * the traversal and %NULL return indicates the end. During traversal,
79 * each returned cgroup is unlinked from the tree. Must be called with the
80 * matching cgroup_rstat_cpu_lock held.
81 *
82 * The only ordering guarantee is that, for a parent and a child pair
83 * covered by a given traversal, if a child is visited, its parent is
84 * guaranteed to be visited afterwards.
85 */
86static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
87 struct cgroup *root, int cpu)
88{
89 struct cgroup_rstat_cpu *rstatc;
90 struct cgroup *parent;
91
92 if (pos == root)
93 return NULL;
94
95 /*
96 * We're gonna walk down to the first leaf and visit/remove it. We
 97	 * can pick any unvisited node as the starting point.
98 */
99 if (!pos)
100 pos = root;
101 else
102 pos = cgroup_parent(pos);
103
104 /* walk down to the first leaf */
105 while (true) {
106 rstatc = cgroup_rstat_cpu(pos, cpu);
107 if (rstatc->updated_children == pos)
108 break;
109 pos = rstatc->updated_children;
110 }
111
112 /*
113 * Unlink @pos from the tree. As the updated_children list is
114 * singly linked, we have to walk it to find the removal point.
115 * However, due to the way we traverse, @pos will be the first
116 * child in most cases. The only exception is @root.
117 */
118 parent = cgroup_parent(pos);
119 if (parent && rstatc->updated_next) {
120 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
121 struct cgroup_rstat_cpu *nrstatc;
122 struct cgroup **nextp;
123
124 nextp = &prstatc->updated_children;
125 while (true) {
126 nrstatc = cgroup_rstat_cpu(*nextp, cpu);
127 if (*nextp == pos)
128 break;
129
130 WARN_ON_ONCE(*nextp == parent);
131 nextp = &nrstatc->updated_next;
132 }
133
134 *nextp = rstatc->updated_next;
135 rstatc->updated_next = NULL;
136
137 /*
 138		 * Paired with the one in cgroup_rstat_updated().
139 * Either they see NULL updated_next or we see their
140 * updated stat.
141 */
142 smp_mb();
143 }
144
145 return pos;
146}
147
148/* see cgroup_rstat_flush() */
149static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
150 __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
151{
152 int cpu;
153
154 lockdep_assert_held(&cgroup_rstat_lock);
155
156 for_each_possible_cpu(cpu) {
157 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
158 cpu);
159 struct cgroup *pos = NULL;
160
161 raw_spin_lock(cpu_lock);
162 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
163 struct cgroup_subsys_state *css;
164
165 cgroup_base_stat_flush(pos, cpu);
166
167 rcu_read_lock();
168 list_for_each_entry_rcu(css, &pos->rstat_css_list,
169 rstat_css_node)
170 css->ss->css_rstat_flush(css, cpu);
171 rcu_read_unlock();
172 }
173 raw_spin_unlock(cpu_lock);
174
175 /* if @may_sleep, play nice and yield if necessary */
176 if (may_sleep && (need_resched() ||
177 spin_needbreak(&cgroup_rstat_lock))) {
178 spin_unlock_irq(&cgroup_rstat_lock);
179 if (!cond_resched())
180 cpu_relax();
181 spin_lock_irq(&cgroup_rstat_lock);
182 }
183 }
184}
185
186/**
187 * cgroup_rstat_flush - flush stats in @cgrp's subtree
188 * @cgrp: target cgroup
189 *
190 * Collect all per-cpu stats in @cgrp's subtree into the global counters
191 * and propagate them upwards. After this function returns, all cgroups in
192 * the subtree have up-to-date ->stat.
193 *
194 * This also gets all cgroups in the subtree including @cgrp off the
195 * ->updated_children lists.
196 *
197 * This function may block.
198 */
199void cgroup_rstat_flush(struct cgroup *cgrp)
200{
201 might_sleep();
202
203 spin_lock_irq(&cgroup_rstat_lock);
204 cgroup_rstat_flush_locked(cgrp, true);
205 spin_unlock_irq(&cgroup_rstat_lock);
206}
207
208/**
209 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
210 * @cgrp: target cgroup
211 *
212 * This function can be called from any context.
213 */
214void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
215{
216 unsigned long flags;
217
218 spin_lock_irqsave(&cgroup_rstat_lock, flags);
219 cgroup_rstat_flush_locked(cgrp, false);
220 spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
221}
222
223/**
 224 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
225 * @cgrp: target cgroup
226 *
227 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
228 * paired with cgroup_rstat_flush_release().
229 *
230 * This function may block.
231 */
232void cgroup_rstat_flush_hold(struct cgroup *cgrp)
233 __acquires(&cgroup_rstat_lock)
234{
235 might_sleep();
236 spin_lock_irq(&cgroup_rstat_lock);
237 cgroup_rstat_flush_locked(cgrp, true);
238}
239
240/**
241 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
242 */
243void cgroup_rstat_flush_release(void)
244 __releases(&cgroup_rstat_lock)
245{
246 spin_unlock_irq(&cgroup_rstat_lock);
247}
248
249int cgroup_rstat_init(struct cgroup *cgrp)
250{
251 int cpu;
252
253 /* the root cgrp has rstat_cpu preallocated */
254 if (!cgrp->rstat_cpu) {
255 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
256 if (!cgrp->rstat_cpu)
257 return -ENOMEM;
258 }
259
260 /* ->updated_children list is self terminated */
261 for_each_possible_cpu(cpu) {
262 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
263
264 rstatc->updated_children = cgrp;
265 u64_stats_init(&rstatc->bsync);
266 }
267
268 return 0;
269}
270
271void cgroup_rstat_exit(struct cgroup *cgrp)
272{
273 int cpu;
274
275 cgroup_rstat_flush(cgrp);
276
277 /* sanity check */
278 for_each_possible_cpu(cpu) {
279 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
280
281 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
282 WARN_ON_ONCE(rstatc->updated_next))
283 return;
284 }
285
286 free_percpu(cgrp->rstat_cpu);
287 cgrp->rstat_cpu = NULL;
288}
289
290void __init cgroup_rstat_boot(void)
291{
292 int cpu;
293
294 for_each_possible_cpu(cpu)
295 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
296
297 BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
298}
299
300/*
301 * Functions for cgroup basic resource statistics implemented on top of
302 * rstat.
303 */
304static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
305 struct cgroup_base_stat *src_bstat)
306{
307 dst_bstat->cputime.utime += src_bstat->cputime.utime;
308 dst_bstat->cputime.stime += src_bstat->cputime.stime;
309 dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
310}
311
312static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
313{
314 struct cgroup *parent = cgroup_parent(cgrp);
315 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
316 struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
317 struct task_cputime cputime;
318 struct cgroup_base_stat delta;
319 unsigned seq;
320
321 /* fetch the current per-cpu values */
322 do {
323 seq = __u64_stats_fetch_begin(&rstatc->bsync);
324 cputime = rstatc->bstat.cputime;
325 } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
326
 327	/* calculate the delta to propagate */
328 delta.cputime.utime = cputime.utime - last_cputime->utime;
329 delta.cputime.stime = cputime.stime - last_cputime->stime;
330 delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
331 last_cputime->sum_exec_runtime;
332 *last_cputime = cputime;
333
334 /* transfer the pending stat into delta */
335 cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
336 memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));
337
338 /* propagate delta into the global stat and the parent's pending */
339 cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
340 if (parent)
341 cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
342}
343
344static struct cgroup_rstat_cpu *
345cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
346{
347 struct cgroup_rstat_cpu *rstatc;
348
349 rstatc = get_cpu_ptr(cgrp->rstat_cpu);
350 u64_stats_update_begin(&rstatc->bsync);
351 return rstatc;
352}
353
354static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
355 struct cgroup_rstat_cpu *rstatc)
356{
357 u64_stats_update_end(&rstatc->bsync);
358 cgroup_rstat_updated(cgrp, smp_processor_id());
359 put_cpu_ptr(rstatc);
360}
361
362void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
363{
364 struct cgroup_rstat_cpu *rstatc;
365
366 rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
367 rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
368 cgroup_base_stat_cputime_account_end(cgrp, rstatc);
369}
370
371void __cgroup_account_cputime_field(struct cgroup *cgrp,
372 enum cpu_usage_stat index, u64 delta_exec)
373{
374 struct cgroup_rstat_cpu *rstatc;
375
376 rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
377
378 switch (index) {
379 case CPUTIME_USER:
380 case CPUTIME_NICE:
381 rstatc->bstat.cputime.utime += delta_exec;
382 break;
383 case CPUTIME_SYSTEM:
384 case CPUTIME_IRQ:
385 case CPUTIME_SOFTIRQ:
386 rstatc->bstat.cputime.stime += delta_exec;
387 break;
388 default:
389 break;
390 }
391
392 cgroup_base_stat_cputime_account_end(cgrp, rstatc);
393}
394
395void cgroup_base_stat_cputime_show(struct seq_file *seq)
396{
397 struct cgroup *cgrp = seq_css(seq)->cgroup;
398 u64 usage, utime, stime;
399
400 if (!cgroup_parent(cgrp))
401 return;
402
403 cgroup_rstat_flush_hold(cgrp);
404 usage = cgrp->bstat.cputime.sum_exec_runtime;
405 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
406 cgroup_rstat_flush_release();
407
408 do_div(usage, NSEC_PER_USEC);
409 do_div(utime, NSEC_PER_USEC);
410 do_div(stime, NSEC_PER_USEC);
411
412 seq_printf(seq, "usage_usec %llu\n"
413 "user_usec %llu\n"
414 "system_usec %llu\n",
415 usage, utime, stime);
416}
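Tying the pieces together, a reader of the hypothetical per-css counter from the earlier sketch would follow the same pattern as cgroup_base_stat_cputime_show() above: flush the subtree with the lock held so the totals cannot move while they are reported, then release. This path may sleep, so it is not for atomic context; that is what cgroup_rstat_flush_irqsafe() is for, minus the hold.

    /* sketch only: example_css/events_total come from the earlier sketch */
    static u64 example_read_events(struct example_css *ecss)
    {
    	u64 total;

    	cgroup_rstat_flush_hold(ecss->css.cgroup);	/* flushes and holds */
    	total = ecss->events_total;
    	cgroup_rstat_flush_release();

    	return total;
    }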
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
deleted file mode 100644
index 1e111dd455c4..000000000000
--- a/kernel/cgroup/stat.c
+++ /dev/null
@@ -1,338 +0,0 @@
1#include "cgroup-internal.h"
2
3#include <linux/sched/cputime.h>
4
5static DEFINE_MUTEX(cgroup_stat_mutex);
6static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
7
8static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
9{
10 return per_cpu_ptr(cgrp->cpu_stat, cpu);
11}
12
13/**
14 * cgroup_cpu_stat_updated - keep track of updated cpu_stat
15 * @cgrp: target cgroup
16 * @cpu: cpu on which cpu_stat was updated
17 *
18 * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
19 * cpu_stat->updated_children list. See the comment on top of
20 * cgroup_cpu_stat definition for details.
21 */
22static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
23{
24 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
25 struct cgroup *parent;
26 unsigned long flags;
27
28 /*
29 * Speculative already-on-list test. This may race leading to
30 * temporary inaccuracies, which is fine.
31 *
32 * Because @parent's updated_children is terminated with @parent
33 * instead of NULL, we can tell whether @cgrp is on the list by
34 * testing the next pointer for NULL.
35 */
36 if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
37 return;
38
39 raw_spin_lock_irqsave(cpu_lock, flags);
40
41 /* put @cgrp and all ancestors on the corresponding updated lists */
42 for (parent = cgroup_parent(cgrp); parent;
43 cgrp = parent, parent = cgroup_parent(cgrp)) {
44 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
45 struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
46
47 /*
48 * Both additions and removals are bottom-up. If a cgroup
49 * is already in the tree, all ancestors are.
50 */
51 if (cstat->updated_next)
52 break;
53
54 cstat->updated_next = pcstat->updated_children;
55 pcstat->updated_children = cgrp;
56 }
57
58 raw_spin_unlock_irqrestore(cpu_lock, flags);
59}
60
61/**
62 * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
63 * @pos: current position
64 * @root: root of the tree to traversal
65 * @cpu: target cpu
66 *
67 * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
68 * the traversal and %NULL return indicates the end. During traversal,
69 * each returned cgroup is unlinked from the tree. Must be called with the
70 * matching cgroup_cpu_stat_lock held.
71 *
72 * The only ordering guarantee is that, for a parent and a child pair
73 * covered by a given traversal, if a child is visited, its parent is
74 * guaranteed to be visited afterwards.
75 */
76static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
77 struct cgroup *root, int cpu)
78{
79 struct cgroup_cpu_stat *cstat;
80 struct cgroup *parent;
81
82 if (pos == root)
83 return NULL;
84
85 /*
86 * We're gonna walk down to the first leaf and visit/remove it. We
87 * can pick whatever unvisited node as the starting point.
88 */
89 if (!pos)
90 pos = root;
91 else
92 pos = cgroup_parent(pos);
93
94 /* walk down to the first leaf */
95 while (true) {
96 cstat = cgroup_cpu_stat(pos, cpu);
97 if (cstat->updated_children == pos)
98 break;
99 pos = cstat->updated_children;
100 }
101
102 /*
103 * Unlink @pos from the tree. As the updated_children list is
104 * singly linked, we have to walk it to find the removal point.
105 * However, due to the way we traverse, @pos will be the first
106 * child in most cases. The only exception is @root.
107 */
108 parent = cgroup_parent(pos);
109 if (parent && cstat->updated_next) {
110 struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
111 struct cgroup_cpu_stat *ncstat;
112 struct cgroup **nextp;
113
114 nextp = &pcstat->updated_children;
115 while (true) {
116 ncstat = cgroup_cpu_stat(*nextp, cpu);
117 if (*nextp == pos)
118 break;
119
120 WARN_ON_ONCE(*nextp == parent);
121 nextp = &ncstat->updated_next;
122 }
123
124 *nextp = cstat->updated_next;
125 cstat->updated_next = NULL;
126 }
127
128 return pos;
129}
130
131static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
132 struct cgroup_stat *src_stat)
133{
134 dst_stat->cputime.utime += src_stat->cputime.utime;
135 dst_stat->cputime.stime += src_stat->cputime.stime;
136 dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
137}
138
139static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
140{
141 struct cgroup *parent = cgroup_parent(cgrp);
142 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
143 struct task_cputime *last_cputime = &cstat->last_cputime;
144 struct task_cputime cputime;
145 struct cgroup_stat delta;
146 unsigned seq;
147
148 lockdep_assert_held(&cgroup_stat_mutex);
149
150 /* fetch the current per-cpu values */
151 do {
152 seq = __u64_stats_fetch_begin(&cstat->sync);
153 cputime = cstat->cputime;
154 } while (__u64_stats_fetch_retry(&cstat->sync, seq));
155
156 /* accumulate the deltas to propgate */
157 delta.cputime.utime = cputime.utime - last_cputime->utime;
158 delta.cputime.stime = cputime.stime - last_cputime->stime;
159 delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
160 last_cputime->sum_exec_runtime;
161 *last_cputime = cputime;
162
163 /* transfer the pending stat into delta */
164 cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
165 memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
166
167 /* propagate delta into the global stat and the parent's pending */
168 cgroup_stat_accumulate(&cgrp->stat, &delta);
169 if (parent)
170 cgroup_stat_accumulate(&parent->pending_stat, &delta);
171}
172
173/* see cgroup_stat_flush() */
174static void cgroup_stat_flush_locked(struct cgroup *cgrp)
175{
176 int cpu;
177
178 lockdep_assert_held(&cgroup_stat_mutex);
179
180 for_each_possible_cpu(cpu) {
181 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
182 struct cgroup *pos = NULL;
183
184 raw_spin_lock_irq(cpu_lock);
185 while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
186 cgroup_cpu_stat_flush_one(pos, cpu);
187 raw_spin_unlock_irq(cpu_lock);
188 }
189}
190
191/**
192 * cgroup_stat_flush - flush stats in @cgrp's subtree
193 * @cgrp: target cgroup
194 *
195 * Collect all per-cpu stats in @cgrp's subtree into the global counters
196 * and propagate them upwards. After this function returns, all cgroups in
197 * the subtree have up-to-date ->stat.
198 *
199 * This also gets all cgroups in the subtree including @cgrp off the
200 * ->updated_children lists.
201 */
202void cgroup_stat_flush(struct cgroup *cgrp)
203{
204 mutex_lock(&cgroup_stat_mutex);
205 cgroup_stat_flush_locked(cgrp);
206 mutex_unlock(&cgroup_stat_mutex);
207}
208
209static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
210{
211 struct cgroup_cpu_stat *cstat;
212
213 cstat = get_cpu_ptr(cgrp->cpu_stat);
214 u64_stats_update_begin(&cstat->sync);
215 return cstat;
216}
217
218static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
219 struct cgroup_cpu_stat *cstat)
220{
221 u64_stats_update_end(&cstat->sync);
222 cgroup_cpu_stat_updated(cgrp, smp_processor_id());
223 put_cpu_ptr(cstat);
224}
225
226void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
227{
228 struct cgroup_cpu_stat *cstat;
229
230 cstat = cgroup_cpu_stat_account_begin(cgrp);
231 cstat->cputime.sum_exec_runtime += delta_exec;
232 cgroup_cpu_stat_account_end(cgrp, cstat);
233}
234
235void __cgroup_account_cputime_field(struct cgroup *cgrp,
236 enum cpu_usage_stat index, u64 delta_exec)
237{
238 struct cgroup_cpu_stat *cstat;
239
240 cstat = cgroup_cpu_stat_account_begin(cgrp);
241
242 switch (index) {
243 case CPUTIME_USER:
244 case CPUTIME_NICE:
245 cstat->cputime.utime += delta_exec;
246 break;
247 case CPUTIME_SYSTEM:
248 case CPUTIME_IRQ:
249 case CPUTIME_SOFTIRQ:
250 cstat->cputime.stime += delta_exec;
251 break;
252 default:
253 break;
254 }
255
256 cgroup_cpu_stat_account_end(cgrp, cstat);
257}
258
259void cgroup_stat_show_cputime(struct seq_file *seq)
260{
261 struct cgroup *cgrp = seq_css(seq)->cgroup;
262 u64 usage, utime, stime;
263
264 if (!cgroup_parent(cgrp))
265 return;
266
267 mutex_lock(&cgroup_stat_mutex);
268
269 cgroup_stat_flush_locked(cgrp);
270
271 usage = cgrp->stat.cputime.sum_exec_runtime;
272 cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
273 &utime, &stime);
274
275 mutex_unlock(&cgroup_stat_mutex);
276
277 do_div(usage, NSEC_PER_USEC);
278 do_div(utime, NSEC_PER_USEC);
279 do_div(stime, NSEC_PER_USEC);
280
281 seq_printf(seq, "usage_usec %llu\n"
282 "user_usec %llu\n"
283 "system_usec %llu\n",
284 usage, utime, stime);
285}
286
287int cgroup_stat_init(struct cgroup *cgrp)
288{
289 int cpu;
290
291 /* the root cgrp has cpu_stat preallocated */
292 if (!cgrp->cpu_stat) {
293 cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
294 if (!cgrp->cpu_stat)
295 return -ENOMEM;
296 }
297
298 /* ->updated_children list is self terminated */
299 for_each_possible_cpu(cpu) {
300 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
301
302 cstat->updated_children = cgrp;
303 u64_stats_init(&cstat->sync);
304 }
305
306 prev_cputime_init(&cgrp->stat.prev_cputime);
307
308 return 0;
309}
310
311void cgroup_stat_exit(struct cgroup *cgrp)
312{
313 int cpu;
314
315 cgroup_stat_flush(cgrp);
316
317 /* sanity check */
318 for_each_possible_cpu(cpu) {
319 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
320
321 if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
322 WARN_ON_ONCE(cstat->updated_next))
323 return;
324 }
325
326 free_percpu(cgrp->cpu_stat);
327 cgrp->cpu_stat = NULL;
328}
329
330void __init cgroup_stat_boot(void)
331{
332 int cpu;
333
334 for_each_possible_cpu(cpu)
335 raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
336
337 BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
338}