-rw-r--r--  include/linux/cgroup-defs.h     |  52
-rw-r--r--  include/linux/cgroup.h          |  12
-rw-r--r--  kernel/cgroup/Makefile          |   2
-rw-r--r--  kernel/cgroup/cgroup-internal.h |  11
-rw-r--r--  kernel/cgroup/cgroup.c          | 105
-rw-r--r--  kernel/cgroup/rdma.c            |  35
-rw-r--r--  kernel/cgroup/rstat.c           | 416
-rw-r--r--  kernel/cgroup/stat.c            | 338
8 files changed, 554 insertions(+), 417 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index dc5b70449dc6..c0e68f903011 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -105,6 +105,8 @@ enum { | |||
105 | struct cgroup_file { | 105 | struct cgroup_file { |
106 | /* do not access any fields from outside cgroup core */ | 106 | /* do not access any fields from outside cgroup core */ |
107 | struct kernfs_node *kn; | 107 | struct kernfs_node *kn; |
108 | unsigned long notified_at; | ||
109 | struct timer_list notify_timer; | ||
108 | }; | 110 | }; |
109 | 111 | ||
110 | /* | 112 | /* |
@@ -128,6 +130,9 @@ struct cgroup_subsys_state { | |||
128 | struct list_head sibling; | 130 | struct list_head sibling; |
129 | struct list_head children; | 131 | struct list_head children; |
130 | 132 | ||
133 | /* flush target list anchored at cgrp->rstat_css_list */ | ||
134 | struct list_head rstat_css_node; | ||
135 | |||
131 | /* | 136 | /* |
132 | * PI: Subsys-unique ID. 0 is unused and root is always 1. The | 137 | * PI: Subsys-unique ID. 0 is unused and root is always 1. The |
133 | * matching css can be looked up using css_from_id(). | 138 | * matching css can be looked up using css_from_id(). |
@@ -256,12 +261,16 @@ struct css_set { | |||
256 | struct rcu_head rcu_head; | 261 | struct rcu_head rcu_head; |
257 | }; | 262 | }; |
258 | 263 | ||
264 | struct cgroup_base_stat { | ||
265 | struct task_cputime cputime; | ||
266 | }; | ||
267 | |||
259 | /* | 268 | /* |
260 | * cgroup basic resource usage statistics. Accounting is done per-cpu in | 269 | * rstat - cgroup scalable recursive statistics. Accounting is done |
261 | * cgroup_cpu_stat which is then lazily propagated up the hierarchy on | 270 | * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the |
262 | * reads. | 271 | * hierarchy on reads. |
263 | * | 272 | * |
264 | * When a stat gets updated, the cgroup_cpu_stat and its ancestors are | 273 | * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are |
265 | * linked into the updated tree. On the following read, propagation only | 274 | * linked into the updated tree. On the following read, propagation only |
266 | * considers and consumes the updated tree. This makes reading O(the | 275 | * considers and consumes the updated tree. This makes reading O(the |
267 | * number of descendants which have been active since last read) instead of | 276 | * number of descendants which have been active since last read) instead of |
@@ -271,20 +280,24 @@ struct css_set { | |||
271 | * aren't active and stat may be read frequently. The combination can | 280 | * aren't active and stat may be read frequently. The combination can |
272 | * become very expensive. By propagating selectively, increasing reading | 281 | * become very expensive. By propagating selectively, increasing reading |
273 | * frequency decreases the cost of each read. | 282 | * frequency decreases the cost of each read. |
283 | * | ||
284 | * This struct hosts both the fields which implement the above - | ||
285 | * updated_children and updated_next - and the fields which track basic | ||
286 | * resource statistics on top of it - bsync, bstat and last_bstat. | ||
274 | */ | 287 | */ |
275 | struct cgroup_cpu_stat { | 288 | struct cgroup_rstat_cpu { |
276 | /* | 289 | /* |
277 | * ->sync protects all the current counters. These are the only | 290 | * ->bsync protects ->bstat. These are the only fields which get |
278 | * fields which get updated in the hot path. | 291 | * updated in the hot path. |
279 | */ | 292 | */ |
280 | struct u64_stats_sync sync; | 293 | struct u64_stats_sync bsync; |
281 | struct task_cputime cputime; | 294 | struct cgroup_base_stat bstat; |
282 | 295 | ||
283 | /* | 296 | /* |
284 | * Snapshots at the last reading. These are used to calculate the | 297 | * Snapshots at the last reading. These are used to calculate the |
285 | * deltas to propagate to the global counters. | 298 | * deltas to propagate to the global counters. |
286 | */ | 299 | */ |
287 | struct task_cputime last_cputime; | 300 | struct cgroup_base_stat last_bstat; |
288 | 301 | ||
289 | /* | 302 | /* |
290 | * Child cgroups with stat updates on this cpu since the last read | 303 | * Child cgroups with stat updates on this cpu since the last read |
@@ -295,18 +308,12 @@ struct cgroup_cpu_stat { | |||
295 | * to the cgroup makes it unnecessary for each per-cpu struct to | 308 | * to the cgroup makes it unnecessary for each per-cpu struct to |
296 | * point back to the associated cgroup. | 309 | * point back to the associated cgroup. |
297 | * | 310 | * |
298 | * Protected by per-cpu cgroup_cpu_stat_lock. | 311 | * Protected by per-cpu cgroup_rstat_cpu_lock. |
299 | */ | 312 | */ |
300 | struct cgroup *updated_children; /* terminated by self cgroup */ | 313 | struct cgroup *updated_children; /* terminated by self cgroup */ |
301 | struct cgroup *updated_next; /* NULL iff not on the list */ | 314 | struct cgroup *updated_next; /* NULL iff not on the list */ |
302 | }; | 315 | }; |
303 | 316 | ||
304 | struct cgroup_stat { | ||
305 | /* per-cpu statistics are collected into the folowing global counters */ | ||
306 | struct task_cputime cputime; | ||
307 | struct prev_cputime prev_cputime; | ||
308 | }; | ||
309 | |||
310 | struct cgroup { | 317 | struct cgroup { |
311 | /* self css with NULL ->ss, points back to this cgroup */ | 318 | /* self css with NULL ->ss, points back to this cgroup */ |
312 | struct cgroup_subsys_state self; | 319 | struct cgroup_subsys_state self; |
@@ -406,10 +413,14 @@ struct cgroup { | |||
406 | */ | 413 | */ |
407 | struct cgroup *dom_cgrp; | 414 | struct cgroup *dom_cgrp; |
408 | 415 | ||
416 | /* per-cpu recursive resource statistics */ | ||
417 | struct cgroup_rstat_cpu __percpu *rstat_cpu; | ||
418 | struct list_head rstat_css_list; | ||
419 | |||
409 | /* cgroup basic resource statistics */ | 420 | /* cgroup basic resource statistics */ |
410 | struct cgroup_cpu_stat __percpu *cpu_stat; | 421 | struct cgroup_base_stat pending_bstat; /* pending from children */ |
411 | struct cgroup_stat pending_stat; /* pending from children */ | 422 | struct cgroup_base_stat bstat; |
412 | struct cgroup_stat stat; | 423 | struct prev_cputime prev_cputime; /* for printing out cputime */ |
413 | 424 | ||
414 | /* | 425 | /* |
415 | * list of pidlists, up to two for each namespace (one for procs, one | 426 | * list of pidlists, up to two for each namespace (one for procs, one |
@@ -570,6 +581,7 @@ struct cgroup_subsys { | |||
570 | void (*css_released)(struct cgroup_subsys_state *css); | 581 | void (*css_released)(struct cgroup_subsys_state *css); |
571 | void (*css_free)(struct cgroup_subsys_state *css); | 582 | void (*css_free)(struct cgroup_subsys_state *css); |
572 | void (*css_reset)(struct cgroup_subsys_state *css); | 583 | void (*css_reset)(struct cgroup_subsys_state *css); |
584 | void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); | ||
573 | int (*css_extra_stat_show)(struct seq_file *seq, | 585 | int (*css_extra_stat_show)(struct seq_file *seq, |
574 | struct cgroup_subsys_state *css); | 586 | struct cgroup_subsys_state *css); |
575 | 587 | ||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0abb86..c9fdf6f57913 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,11 +690,19 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, | |||
690 | char *buf, size_t buflen) {} | 690 | char *buf, size_t buflen) {} |
691 | #endif /* !CONFIG_CGROUPS */ | 691 | #endif /* !CONFIG_CGROUPS */ |
692 | 692 | ||
693 | #ifdef CONFIG_CGROUPS | ||
693 | /* | 694 | /* |
694 | * Basic resource stats. | 695 | * cgroup scalable recursive statistics. |
695 | */ | 696 | */ |
696 | #ifdef CONFIG_CGROUPS | 697 | void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); |
698 | void cgroup_rstat_flush(struct cgroup *cgrp); | ||
699 | void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); | ||
700 | void cgroup_rstat_flush_hold(struct cgroup *cgrp); | ||
701 | void cgroup_rstat_flush_release(void); | ||
697 | 702 | ||
703 | /* | ||
704 | * Basic resource stats. | ||
705 | */ | ||
698 | #ifdef CONFIG_CGROUP_CPUACCT | 706 | #ifdef CONFIG_CGROUP_CPUACCT |
699 | void cpuacct_charge(struct task_struct *tsk, u64 cputime); | 707 | void cpuacct_charge(struct task_struct *tsk, u64 cputime); |
700 | void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); | 708 | void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); |
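A controller consuming the API declared above would typically touch only a per-cpu counter plus cgroup_rstat_updated() on the hot path, fold deltas into a stable counter from its ->css_rstat_flush() callback, and bracket readers with cgroup_rstat_flush_hold()/cgroup_rstat_flush_release(). The following is a hedged, kernel-style sketch, not a compilable unit: the foo_* names and fields are invented, percpu allocation is elided, and 32-bit tearing and upward propagation to the parent css are ignored; it only shows where each call sits.

/* hypothetical per-cpu state and controller state hanging off a css */
struct foo_cpu {
        u64 events;                     /* bumped on the hot path */
        u64 events_last;                /* snapshot as of the last flush */
};

struct foo_css {
        struct cgroup_subsys_state css;
        struct foo_cpu __percpu *pcpu;  /* alloc_percpu() in css_alloc, elided */
        u64 events_total;               /* flushed, per-css value */
};

static void foo_charge(struct foo_css *fcss)
{
        int cpu = get_cpu();            /* pin the cpu for the update */

        this_cpu_inc(fcss->pcpu->events);
        cgroup_rstat_updated(fcss->css.cgroup, cpu);
        put_cpu();
}

static void foo_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
        struct foo_css *fcss = container_of(css, struct foo_css, css);
        struct foo_cpu *fc = per_cpu_ptr(fcss->pcpu, cpu);
        u64 delta = fc->events - fc->events_last;

        fc->events_last = fc->events;
        fcss->events_total += delta;
}

static u64 foo_events_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct foo_css *fcss = container_of(css, struct foo_css, css);
        u64 v;

        cgroup_rstat_flush_hold(css->cgroup);
        v = fcss->events_total;
        cgroup_rstat_flush_release();
        return v;
}

struct cgroup_subsys foo_cgrp_subsys = {
        /* ... css_alloc, css_free, dfl_cftypes ... */
        .css_rstat_flush        = foo_css_rstat_flush,
};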
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 2be89a003185..bfcdae896122 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
2 | obj-y := cgroup.o stat.o namespace.o cgroup-v1.o | 2 | obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o |
3 | 3 | ||
4 | obj-$(CONFIG_CGROUP_FREEZER) += freezer.o | 4 | obj-$(CONFIG_CGROUP_FREEZER) += freezer.o |
5 | obj-$(CONFIG_CGROUP_PIDS) += pids.o | 5 | obj-$(CONFIG_CGROUP_PIDS) += pids.o |
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 0808a33d16d3..77ff1cd6a252 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | |||
201 | int cgroup_task_count(const struct cgroup *cgrp); | 201 | int cgroup_task_count(const struct cgroup *cgrp); |
202 | 202 | ||
203 | /* | 203 | /* |
204 | * stat.c | 204 | * rstat.c |
205 | */ | 205 | */ |
206 | void cgroup_stat_flush(struct cgroup *cgrp); | 206 | int cgroup_rstat_init(struct cgroup *cgrp); |
207 | int cgroup_stat_init(struct cgroup *cgrp); | 207 | void cgroup_rstat_exit(struct cgroup *cgrp); |
208 | void cgroup_stat_exit(struct cgroup *cgrp); | 208 | void cgroup_rstat_boot(void); |
209 | void cgroup_stat_show_cputime(struct seq_file *seq); | 209 | void cgroup_base_stat_cputime_show(struct seq_file *seq); |
210 | void cgroup_stat_boot(void); | ||
211 | 210 | ||
212 | /* | 211 | /* |
213 | * namespace.c | 212 | * namespace.c |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 12883656e63e..acb66713f9b6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@ | |||
54 | #include <linux/proc_ns.h> | 54 | #include <linux/proc_ns.h> |
55 | #include <linux/nsproxy.h> | 55 | #include <linux/nsproxy.h> |
56 | #include <linux/file.h> | 56 | #include <linux/file.h> |
57 | #include <linux/sched/cputime.h> | ||
57 | #include <net/sock.h> | 58 | #include <net/sock.h> |
58 | 59 | ||
59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
@@ -61,6 +62,8 @@ | |||
61 | 62 | ||
62 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ | 63 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ |
63 | MAX_CFTYPE_NAME + 2) | 64 | MAX_CFTYPE_NAME + 2) |
65 | /* let's not notify more than 100 times per second */ | ||
66 | #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) | ||
64 | 67 | ||
65 | /* | 68 | /* |
66 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 69 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
@@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { | |||
142 | }; | 145 | }; |
143 | #undef SUBSYS | 146 | #undef SUBSYS |
144 | 147 | ||
145 | static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); | 148 | static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); |
146 | 149 | ||
147 | /* | 150 | /* |
148 | * The default hierarchy, reserved for the subsystems that are otherwise | 151 | * The default hierarchy, reserved for the subsystems that are otherwise |
149 | * unattached - it never has more than a single cgroup, and all tasks are | 152 | * unattached - it never has more than a single cgroup, and all tasks are |
150 | * part of that cgroup. | 153 | * part of that cgroup. |
151 | */ | 154 | */ |
152 | struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; | 155 | struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; |
153 | EXPORT_SYMBOL_GPL(cgrp_dfl_root); | 156 | EXPORT_SYMBOL_GPL(cgrp_dfl_root); |
154 | 157 | ||
155 | /* | 158 | /* |
@@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
1554 | spin_lock_irq(&cgroup_file_kn_lock); | 1557 | spin_lock_irq(&cgroup_file_kn_lock); |
1555 | cfile->kn = NULL; | 1558 | cfile->kn = NULL; |
1556 | spin_unlock_irq(&cgroup_file_kn_lock); | 1559 | spin_unlock_irq(&cgroup_file_kn_lock); |
1560 | |||
1561 | del_timer_sync(&cfile->notify_timer); | ||
1557 | } | 1562 | } |
1558 | 1563 | ||
1559 | kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); | 1564 | kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); |
@@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css) | |||
1573 | 1578 | ||
1574 | css->flags &= ~CSS_VISIBLE; | 1579 | css->flags &= ~CSS_VISIBLE; |
1575 | 1580 | ||
1576 | list_for_each_entry(cfts, &css->ss->cfts, node) | 1581 | if (!css->ss) { |
1582 | if (cgroup_on_dfl(cgrp)) | ||
1583 | cfts = cgroup_base_files; | ||
1584 | else | ||
1585 | cfts = cgroup1_base_files; | ||
1586 | |||
1577 | cgroup_addrm_files(css, cgrp, cfts, false); | 1587 | cgroup_addrm_files(css, cgrp, cfts, false); |
1588 | } else { | ||
1589 | list_for_each_entry(cfts, &css->ss->cfts, node) | ||
1590 | cgroup_addrm_files(css, cgrp, cfts, false); | ||
1591 | } | ||
1578 | } | 1592 | } |
1579 | 1593 | ||
1580 | /** | 1594 | /** |
@@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css) | |||
1598 | else | 1612 | else |
1599 | cfts = cgroup1_base_files; | 1613 | cfts = cgroup1_base_files; |
1600 | 1614 | ||
1601 | return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); | 1615 | ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); |
1602 | } | 1616 | if (ret < 0) |
1603 | 1617 | return ret; | |
1604 | list_for_each_entry(cfts, &css->ss->cfts, node) { | 1618 | } else { |
1605 | ret = cgroup_addrm_files(css, cgrp, cfts, true); | 1619 | list_for_each_entry(cfts, &css->ss->cfts, node) { |
1606 | if (ret < 0) { | 1620 | ret = cgroup_addrm_files(css, cgrp, cfts, true); |
1607 | failed_cfts = cfts; | 1621 | if (ret < 0) { |
1608 | goto err; | 1622 | failed_cfts = cfts; |
1623 | goto err; | ||
1624 | } | ||
1609 | } | 1625 | } |
1610 | } | 1626 | } |
1611 | 1627 | ||
@@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void) | |||
1782 | { | 1798 | { |
1783 | struct task_struct *p, *g; | 1799 | struct task_struct *p, *g; |
1784 | 1800 | ||
1785 | spin_lock_irq(&css_set_lock); | ||
1786 | |||
1787 | if (use_task_css_set_links) | ||
1788 | goto out_unlock; | ||
1789 | |||
1790 | use_task_css_set_links = true; | ||
1791 | |||
1792 | /* | 1801 | /* |
1793 | * We need tasklist_lock because RCU is not safe against | 1802 | * We need tasklist_lock because RCU is not safe against |
1794 | * while_each_thread(). Besides, a forking task that has passed | 1803 | * while_each_thread(). Besides, a forking task that has passed |
@@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void) | |||
1797 | * tasklist if we walk through it with RCU. | 1806 | * tasklist if we walk through it with RCU. |
1798 | */ | 1807 | */ |
1799 | read_lock(&tasklist_lock); | 1808 | read_lock(&tasklist_lock); |
1809 | spin_lock_irq(&css_set_lock); | ||
1810 | |||
1811 | if (use_task_css_set_links) | ||
1812 | goto out_unlock; | ||
1813 | |||
1814 | use_task_css_set_links = true; | ||
1815 | |||
1800 | do_each_thread(g, p) { | 1816 | do_each_thread(g, p) { |
1801 | WARN_ON_ONCE(!list_empty(&p->cg_list) || | 1817 | WARN_ON_ONCE(!list_empty(&p->cg_list) || |
1802 | task_css_set(p) != &init_css_set); | 1818 | task_css_set(p) != &init_css_set); |
@@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void) | |||
1824 | } | 1840 | } |
1825 | spin_unlock(&p->sighand->siglock); | 1841 | spin_unlock(&p->sighand->siglock); |
1826 | } while_each_thread(g, p); | 1842 | } while_each_thread(g, p); |
1827 | read_unlock(&tasklist_lock); | ||
1828 | out_unlock: | 1843 | out_unlock: |
1829 | spin_unlock_irq(&css_set_lock); | 1844 | spin_unlock_irq(&css_set_lock); |
1845 | read_unlock(&tasklist_lock); | ||
1830 | } | 1846 | } |
1831 | 1847 | ||
1832 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1848 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
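The reordering in the two hunks above makes css_set_lock nest inside tasklist_lock rather than the other way around. Whichever nesting is chosen, every path that takes both locks has to use the same one, since two CPUs acquiring the pair in opposite orders can deadlock (ABBA). A toy userspace illustration of the rule, with hypothetical pthread mutexes standing in for the kernel locks:

#include <pthread.h>

static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER; /* think tasklist_lock */
static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER; /* think css_set_lock */

/* every path that needs both locks takes outer_lock first, then inner_lock */
static void *walk_tasks(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&outer_lock);
        pthread_mutex_lock(&inner_lock);
        /* ... walk the task list and link css_sets ... */
        pthread_mutex_unlock(&inner_lock);
        pthread_mutex_unlock(&outer_lock);
        return NULL;
}

static void fork_path(void)
{
        /* taking inner_lock before outer_lock here could deadlock with walk_tasks() */
        pthread_mutex_lock(&outer_lock);
        pthread_mutex_lock(&inner_lock);
        /* ... add the new task to its css_set ... */
        pthread_mutex_unlock(&inner_lock);
        pthread_mutex_unlock(&outer_lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, walk_tasks, NULL);
        fork_path();
        pthread_join(t, NULL);
        return 0;
}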
@@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1844 | cgrp->dom_cgrp = cgrp; | 1860 | cgrp->dom_cgrp = cgrp; |
1845 | cgrp->max_descendants = INT_MAX; | 1861 | cgrp->max_descendants = INT_MAX; |
1846 | cgrp->max_depth = INT_MAX; | 1862 | cgrp->max_depth = INT_MAX; |
1863 | INIT_LIST_HEAD(&cgrp->rstat_css_list); | ||
1864 | prev_cputime_init(&cgrp->prev_cputime); | ||
1847 | 1865 | ||
1848 | for_each_subsys(ss, ssid) | 1866 | for_each_subsys(ss, ssid) |
1849 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); | 1867 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); |
@@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v) | |||
3381 | struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; | 3399 | struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; |
3382 | int ret = 0; | 3400 | int ret = 0; |
3383 | 3401 | ||
3384 | cgroup_stat_show_cputime(seq); | 3402 | cgroup_base_stat_cputime_show(seq); |
3385 | #ifdef CONFIG_CGROUP_SCHED | 3403 | #ifdef CONFIG_CGROUP_SCHED |
3386 | ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); | 3404 | ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); |
3387 | #endif | 3405 | #endif |
@@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) | |||
3521 | return kernfs_setattr(kn, &iattr); | 3539 | return kernfs_setattr(kn, &iattr); |
3522 | } | 3540 | } |
3523 | 3541 | ||
3542 | static void cgroup_file_notify_timer(struct timer_list *timer) | ||
3543 | { | ||
3544 | cgroup_file_notify(container_of(timer, struct cgroup_file, | ||
3545 | notify_timer)); | ||
3546 | } | ||
3547 | |||
3524 | static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, | 3548 | static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, |
3525 | struct cftype *cft) | 3549 | struct cftype *cft) |
3526 | { | 3550 | { |
@@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, | |||
3547 | if (cft->file_offset) { | 3571 | if (cft->file_offset) { |
3548 | struct cgroup_file *cfile = (void *)css + cft->file_offset; | 3572 | struct cgroup_file *cfile = (void *)css + cft->file_offset; |
3549 | 3573 | ||
3574 | timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); | ||
3575 | |||
3550 | spin_lock_irq(&cgroup_file_kn_lock); | 3576 | spin_lock_irq(&cgroup_file_kn_lock); |
3551 | cfile->kn = kn; | 3577 | cfile->kn = kn; |
3552 | spin_unlock_irq(&cgroup_file_kn_lock); | 3578 | spin_unlock_irq(&cgroup_file_kn_lock); |
@@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile) | |||
3796 | unsigned long flags; | 3822 | unsigned long flags; |
3797 | 3823 | ||
3798 | spin_lock_irqsave(&cgroup_file_kn_lock, flags); | 3824 | spin_lock_irqsave(&cgroup_file_kn_lock, flags); |
3799 | if (cfile->kn) | 3825 | if (cfile->kn) { |
3800 | kernfs_notify(cfile->kn); | 3826 | unsigned long last = cfile->notified_at; |
3827 | unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; | ||
3828 | |||
3829 | if (time_in_range(jiffies, last, next)) { | ||
3830 | timer_reduce(&cfile->notify_timer, next); | ||
3831 | } else { | ||
3832 | kernfs_notify(cfile->kn); | ||
3833 | cfile->notified_at = jiffies; | ||
3834 | } | ||
3835 | } | ||
3801 | spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); | 3836 | spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); |
3802 | } | 3837 | } |
3803 | 3838 | ||
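The new cgroup_file_notify() logic above caps wakeups at CGROUP_FILE_NOTIFY_MIN_INTV: if the current time still falls within [last, last + MIN_INTV], the notification is deferred by arming (or pulling in, via timer_reduce()) a timer that fires at the end of the window; otherwise it is sent immediately and the timestamp is refreshed. Below is a self-contained model of the same window logic with jiffies, time_in_range() and the timer replaced by plain integers; the names are invented, wraparound is ignored, and delivering the deferred wakeup at timer_expires is left to a real timer.

#include <stdbool.h>
#include <stdio.h>

#define NOTIFY_MIN_INTV 10              /* minimum ticks between wakeups */

struct file_state {
        unsigned long notified_at;      /* tick of the last wakeup sent */
        unsigned long timer_expires;    /* 0 if no deferred wakeup armed */
};

/* returns true if a wakeup should be delivered right now */
static bool file_notify(struct file_state *f, unsigned long now)
{
        unsigned long next = f->notified_at + NOTIFY_MIN_INTV;

        if (now >= f->notified_at && now < next) {
                /* still inside the window: defer to the window's end */
                if (!f->timer_expires || f->timer_expires > next)
                        f->timer_expires = next;        /* like timer_reduce() */
                return false;
        }
        f->notified_at = now;
        f->timer_expires = 0;
        return true;
}

int main(void)
{
        struct file_state f = { .notified_at = 0, .timer_expires = 0 };
        unsigned long t;

        for (t = 1; t <= 25; t++)
                if (file_notify(&f, t))
                        printf("wakeup at tick %lu\n", t);      /* ticks 10 and 20 */
        return 0;
}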
@@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work) | |||
4560 | cgroup_put(cgroup_parent(cgrp)); | 4595 | cgroup_put(cgroup_parent(cgrp)); |
4561 | kernfs_put(cgrp->kn); | 4596 | kernfs_put(cgrp->kn); |
4562 | if (cgroup_on_dfl(cgrp)) | 4597 | if (cgroup_on_dfl(cgrp)) |
4563 | cgroup_stat_exit(cgrp); | 4598 | cgroup_rstat_exit(cgrp); |
4564 | kfree(cgrp); | 4599 | kfree(cgrp); |
4565 | } else { | 4600 | } else { |
4566 | /* | 4601 | /* |
@@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work) | |||
4587 | 4622 | ||
4588 | if (ss) { | 4623 | if (ss) { |
4589 | /* css release path */ | 4624 | /* css release path */ |
4625 | if (!list_empty(&css->rstat_css_node)) { | ||
4626 | cgroup_rstat_flush(cgrp); | ||
4627 | list_del_rcu(&css->rstat_css_node); | ||
4628 | } | ||
4629 | |||
4590 | cgroup_idr_replace(&ss->css_idr, NULL, css->id); | 4630 | cgroup_idr_replace(&ss->css_idr, NULL, css->id); |
4591 | if (ss->css_released) | 4631 | if (ss->css_released) |
4592 | ss->css_released(css); | 4632 | ss->css_released(css); |
@@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work) | |||
4597 | trace_cgroup_release(cgrp); | 4637 | trace_cgroup_release(cgrp); |
4598 | 4638 | ||
4599 | if (cgroup_on_dfl(cgrp)) | 4639 | if (cgroup_on_dfl(cgrp)) |
4600 | cgroup_stat_flush(cgrp); | 4640 | cgroup_rstat_flush(cgrp); |
4601 | 4641 | ||
4602 | for (tcgrp = cgroup_parent(cgrp); tcgrp; | 4642 | for (tcgrp = cgroup_parent(cgrp); tcgrp; |
4603 | tcgrp = cgroup_parent(tcgrp)) | 4643 | tcgrp = cgroup_parent(tcgrp)) |
@@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
4648 | css->id = -1; | 4688 | css->id = -1; |
4649 | INIT_LIST_HEAD(&css->sibling); | 4689 | INIT_LIST_HEAD(&css->sibling); |
4650 | INIT_LIST_HEAD(&css->children); | 4690 | INIT_LIST_HEAD(&css->children); |
4691 | INIT_LIST_HEAD(&css->rstat_css_node); | ||
4651 | css->serial_nr = css_serial_nr_next++; | 4692 | css->serial_nr = css_serial_nr_next++; |
4652 | atomic_set(&css->online_cnt, 0); | 4693 | atomic_set(&css->online_cnt, 0); |
4653 | 4694 | ||
@@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
4656 | css_get(css->parent); | 4697 | css_get(css->parent); |
4657 | } | 4698 | } |
4658 | 4699 | ||
4700 | if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) | ||
4701 | list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); | ||
4702 | |||
4659 | BUG_ON(cgroup_css(cgrp, ss)); | 4703 | BUG_ON(cgroup_css(cgrp, ss)); |
4660 | } | 4704 | } |
4661 | 4705 | ||
@@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, | |||
4757 | err_list_del: | 4801 | err_list_del: |
4758 | list_del_rcu(&css->sibling); | 4802 | list_del_rcu(&css->sibling); |
4759 | err_free_css: | 4803 | err_free_css: |
4804 | list_del_rcu(&css->rstat_css_node); | ||
4760 | INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); | 4805 | INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); |
4761 | queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); | 4806 | queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); |
4762 | return ERR_PTR(err); | 4807 | return ERR_PTR(err); |
@@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
4785 | goto out_free_cgrp; | 4830 | goto out_free_cgrp; |
4786 | 4831 | ||
4787 | if (cgroup_on_dfl(parent)) { | 4832 | if (cgroup_on_dfl(parent)) { |
4788 | ret = cgroup_stat_init(cgrp); | 4833 | ret = cgroup_rstat_init(cgrp); |
4789 | if (ret) | 4834 | if (ret) |
4790 | goto out_cancel_ref; | 4835 | goto out_cancel_ref; |
4791 | } | 4836 | } |
@@ -4850,7 +4895,7 @@ out_idr_free: | |||
4850 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); | 4895 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
4851 | out_stat_exit: | 4896 | out_stat_exit: |
4852 | if (cgroup_on_dfl(parent)) | 4897 | if (cgroup_on_dfl(parent)) |
4853 | cgroup_stat_exit(cgrp); | 4898 | cgroup_rstat_exit(cgrp); |
4854 | out_cancel_ref: | 4899 | out_cancel_ref: |
4855 | percpu_ref_exit(&cgrp->self.refcnt); | 4900 | percpu_ref_exit(&cgrp->self.refcnt); |
4856 | out_free_cgrp: | 4901 | out_free_cgrp: |
@@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
5090 | for_each_css(css, ssid, cgrp) | 5135 | for_each_css(css, ssid, cgrp) |
5091 | kill_css(css); | 5136 | kill_css(css); |
5092 | 5137 | ||
5093 | /* | 5138 | /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ |
5094 | * Remove @cgrp directory along with the base files. @cgrp has an | 5139 | css_clear_dir(&cgrp->self); |
5095 | * extra ref on its kn. | ||
5096 | */ | ||
5097 | kernfs_remove(cgrp->kn); | 5140 | kernfs_remove(cgrp->kn); |
5098 | 5141 | ||
5099 | if (parent && cgroup_is_threaded(cgrp)) | 5142 | if (parent && cgroup_is_threaded(cgrp)) |
@@ -5245,7 +5288,7 @@ int __init cgroup_init(void) | |||
5245 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); | 5288 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
5246 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); | 5289 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); |
5247 | 5290 | ||
5248 | cgroup_stat_boot(); | 5291 | cgroup_rstat_boot(); |
5249 | 5292 | ||
5250 | /* | 5293 | /* |
5251 | * The latency of the synchronize_sched() is too high for cgroups, | 5294 | * The latency of the synchronize_sched() is too high for cgroups, |
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index defad3c5e7dc..d3bbb757ee49 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device); | |||
362 | static int parse_resource(char *c, int *intval) | 362 | static int parse_resource(char *c, int *intval) |
363 | { | 363 | { |
364 | substring_t argstr; | 364 | substring_t argstr; |
365 | const char **table = &rdmacg_resource_names[0]; | ||
366 | char *name, *value = c; | 365 | char *name, *value = c; |
367 | size_t len; | 366 | size_t len; |
368 | int ret, i = 0; | 367 | int ret, i; |
369 | 368 | ||
370 | name = strsep(&value, "="); | 369 | name = strsep(&value, "="); |
371 | if (!name || !value) | 370 | if (!name || !value) |
372 | return -EINVAL; | 371 | return -EINVAL; |
373 | 372 | ||
374 | len = strlen(value); | 373 | i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); |
374 | if (i < 0) | ||
375 | return i; | ||
375 | 376 | ||
376 | for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { | 377 | len = strlen(value); |
377 | if (strcmp(table[i], name)) | ||
378 | continue; | ||
379 | 378 | ||
380 | argstr.from = value; | 379 | argstr.from = value; |
381 | argstr.to = value + len; | 380 | argstr.to = value + len; |
382 | 381 | ||
383 | ret = match_int(&argstr, intval); | 382 | ret = match_int(&argstr, intval); |
384 | if (ret >= 0) { | 383 | if (ret >= 0) { |
385 | if (*intval < 0) | 384 | if (*intval < 0) |
386 | break; | 385 | return -EINVAL; |
387 | return i; | 386 | return i; |
388 | } | 387 | } |
389 | if (strncmp(value, RDMACG_MAX_STR, len) == 0) { | 388 | if (strncmp(value, RDMACG_MAX_STR, len) == 0) { |
390 | *intval = S32_MAX; | 389 | *intval = S32_MAX; |
391 | return i; | 390 | return i; |
392 | } | ||
393 | break; | ||
394 | } | 391 | } |
395 | return -EINVAL; | 392 | return -EINVAL; |
396 | } | 393 | } |
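parse_resource() now delegates the name lookup to match_string(), which scans an array of strings and returns the matching index or a negative errno, so the open-coded comparison loop disappears. A small userspace equivalent of that lookup is sketched below; my_match_string() is a stand-in for the kernel helper, and the two resource names are used only for illustration.

#include <errno.h>
#include <stdio.h>
#include <string.h>

static const char *const resource_names[] = { "hca_handle", "hca_object" };

/* stand-in for the kernel's match_string(): index on success, -EINVAL otherwise */
static int my_match_string(const char *const *array, size_t n, const char *s)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (array[i] && !strcmp(array[i], s))
                        return (int)i;
        return -EINVAL;
}

int main(void)
{
        printf("%d\n", my_match_string(resource_names, 2, "hca_object"));       /* 1 */
        printf("%d\n", my_match_string(resource_names, 2, "bogus"));            /* -EINVAL */
        return 0;
}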
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..d503d1a9007c
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,416 @@ | |||
1 | #include "cgroup-internal.h" | ||
2 | |||
3 | #include <linux/sched/cputime.h> | ||
4 | |||
5 | static DEFINE_SPINLOCK(cgroup_rstat_lock); | ||
6 | static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); | ||
7 | |||
8 | static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); | ||
9 | |||
10 | static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) | ||
11 | { | ||
12 | return per_cpu_ptr(cgrp->rstat_cpu, cpu); | ||
13 | } | ||
14 | |||
15 | /** | ||
16 | * cgroup_rstat_updated - keep track of updated rstat_cpu | ||
17 | * @cgrp: target cgroup | ||
18 | * @cpu: cpu on which rstat_cpu was updated | ||
19 | * | ||
20 | * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching | ||
21 | * rstat_cpu->updated_children list. See the comment on top of | ||
22 | * cgroup_rstat_cpu definition for details. | ||
23 | */ | ||
24 | void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) | ||
25 | { | ||
26 | raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); | ||
27 | struct cgroup *parent; | ||
28 | unsigned long flags; | ||
29 | |||
30 | /* nothing to do for root */ | ||
31 | if (!cgroup_parent(cgrp)) | ||
32 | return; | ||
33 | |||
34 | /* | ||
35 | * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we | ||
36 | * see NULL updated_next or they see our updated stat. | ||
37 | */ | ||
38 | smp_mb(); | ||
39 | |||
40 | /* | ||
41 | * Because @parent's updated_children is terminated with @parent | ||
42 | * instead of NULL, we can tell whether @cgrp is on the list by | ||
43 | * testing the next pointer for NULL. | ||
44 | */ | ||
45 | if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) | ||
46 | return; | ||
47 | |||
48 | raw_spin_lock_irqsave(cpu_lock, flags); | ||
49 | |||
50 | /* put @cgrp and all ancestors on the corresponding updated lists */ | ||
51 | for (parent = cgroup_parent(cgrp); parent; | ||
52 | cgrp = parent, parent = cgroup_parent(cgrp)) { | ||
53 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); | ||
54 | struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); | ||
55 | |||
56 | /* | ||
57 | * Both additions and removals are bottom-up. If a cgroup | ||
58 | * is already in the tree, all ancestors are. | ||
59 | */ | ||
60 | if (rstatc->updated_next) | ||
61 | break; | ||
62 | |||
63 | rstatc->updated_next = prstatc->updated_children; | ||
64 | prstatc->updated_children = cgrp; | ||
65 | } | ||
66 | |||
67 | raw_spin_unlock_irqrestore(cpu_lock, flags); | ||
68 | } | ||
69 | EXPORT_SYMBOL_GPL(cgroup_rstat_updated); | ||
70 | |||
71 | /** | ||
72 | * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree | ||
73 | * @pos: current position | ||
74 | * @root: root of the tree to traversal | ||
75 | * @cpu: target cpu | ||
76 | * | ||
77 | * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts | ||
78 | * the traversal and %NULL return indicates the end. During traversal, | ||
79 | * each returned cgroup is unlinked from the tree. Must be called with the | ||
80 | * matching cgroup_rstat_cpu_lock held. | ||
81 | * | ||
82 | * The only ordering guarantee is that, for a parent and a child pair | ||
83 | * covered by a given traversal, if a child is visited, its parent is | ||
84 | * guaranteed to be visited afterwards. | ||
85 | */ | ||
86 | static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, | ||
87 | struct cgroup *root, int cpu) | ||
88 | { | ||
89 | struct cgroup_rstat_cpu *rstatc; | ||
90 | struct cgroup *parent; | ||
91 | |||
92 | if (pos == root) | ||
93 | return NULL; | ||
94 | |||
95 | /* | ||
96 | * We're gonna walk down to the first leaf and visit/remove it. We | ||
97 | * can pick whatever unvisited node as the starting point. | ||
98 | */ | ||
99 | if (!pos) | ||
100 | pos = root; | ||
101 | else | ||
102 | pos = cgroup_parent(pos); | ||
103 | |||
104 | /* walk down to the first leaf */ | ||
105 | while (true) { | ||
106 | rstatc = cgroup_rstat_cpu(pos, cpu); | ||
107 | if (rstatc->updated_children == pos) | ||
108 | break; | ||
109 | pos = rstatc->updated_children; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Unlink @pos from the tree. As the updated_children list is | ||
114 | * singly linked, we have to walk it to find the removal point. | ||
115 | * However, due to the way we traverse, @pos will be the first | ||
116 | * child in most cases. The only exception is @root. | ||
117 | */ | ||
118 | parent = cgroup_parent(pos); | ||
119 | if (parent && rstatc->updated_next) { | ||
120 | struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); | ||
121 | struct cgroup_rstat_cpu *nrstatc; | ||
122 | struct cgroup **nextp; | ||
123 | |||
124 | nextp = &prstatc->updated_children; | ||
125 | while (true) { | ||
126 | nrstatc = cgroup_rstat_cpu(*nextp, cpu); | ||
127 | if (*nextp == pos) | ||
128 | break; | ||
129 | |||
130 | WARN_ON_ONCE(*nextp == parent); | ||
131 | nextp = &nrstatc->updated_next; | ||
132 | } | ||
133 | |||
134 | *nextp = rstatc->updated_next; | ||
135 | rstatc->updated_next = NULL; | ||
136 | |||
137 | /* | ||
138 | * Paired with the one in cgroup_rstat_cpu_updated(). | ||
139 | * Either they see NULL updated_next or we see their | ||
140 | * updated stat. | ||
141 | */ | ||
142 | smp_mb(); | ||
143 | } | ||
144 | |||
145 | return pos; | ||
146 | } | ||
147 | |||
148 | /* see cgroup_rstat_flush() */ | ||
149 | static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) | ||
150 | __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) | ||
151 | { | ||
152 | int cpu; | ||
153 | |||
154 | lockdep_assert_held(&cgroup_rstat_lock); | ||
155 | |||
156 | for_each_possible_cpu(cpu) { | ||
157 | raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, | ||
158 | cpu); | ||
159 | struct cgroup *pos = NULL; | ||
160 | |||
161 | raw_spin_lock(cpu_lock); | ||
162 | while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { | ||
163 | struct cgroup_subsys_state *css; | ||
164 | |||
165 | cgroup_base_stat_flush(pos, cpu); | ||
166 | |||
167 | rcu_read_lock(); | ||
168 | list_for_each_entry_rcu(css, &pos->rstat_css_list, | ||
169 | rstat_css_node) | ||
170 | css->ss->css_rstat_flush(css, cpu); | ||
171 | rcu_read_unlock(); | ||
172 | } | ||
173 | raw_spin_unlock(cpu_lock); | ||
174 | |||
175 | /* if @may_sleep, play nice and yield if necessary */ | ||
176 | if (may_sleep && (need_resched() || | ||
177 | spin_needbreak(&cgroup_rstat_lock))) { | ||
178 | spin_unlock_irq(&cgroup_rstat_lock); | ||
179 | if (!cond_resched()) | ||
180 | cpu_relax(); | ||
181 | spin_lock_irq(&cgroup_rstat_lock); | ||
182 | } | ||
183 | } | ||
184 | } | ||
185 | |||
186 | /** | ||
187 | * cgroup_rstat_flush - flush stats in @cgrp's subtree | ||
188 | * @cgrp: target cgroup | ||
189 | * | ||
190 | * Collect all per-cpu stats in @cgrp's subtree into the global counters | ||
191 | * and propagate them upwards. After this function returns, all cgroups in | ||
192 | * the subtree have up-to-date ->stat. | ||
193 | * | ||
194 | * This also gets all cgroups in the subtree including @cgrp off the | ||
195 | * ->updated_children lists. | ||
196 | * | ||
197 | * This function may block. | ||
198 | */ | ||
199 | void cgroup_rstat_flush(struct cgroup *cgrp) | ||
200 | { | ||
201 | might_sleep(); | ||
202 | |||
203 | spin_lock_irq(&cgroup_rstat_lock); | ||
204 | cgroup_rstat_flush_locked(cgrp, true); | ||
205 | spin_unlock_irq(&cgroup_rstat_lock); | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() | ||
210 | * @cgrp: target cgroup | ||
211 | * | ||
212 | * This function can be called from any context. | ||
213 | */ | ||
214 | void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) | ||
215 | { | ||
216 | unsigned long flags; | ||
217 | |||
218 | spin_lock_irqsave(&cgroup_rstat_lock, flags); | ||
219 | cgroup_rstat_flush_locked(cgrp, false); | ||
220 | spin_unlock_irqrestore(&cgroup_rstat_lock, flags); | ||
221 | } | ||
222 | |||
223 | /** | ||
224 | * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold | ||
225 | * @cgrp: target cgroup | ||
226 | * | ||
227 | * Flush stats in @cgrp's subtree and prevent further flushes. Must be | ||
228 | * paired with cgroup_rstat_flush_release(). | ||
229 | * | ||
230 | * This function may block. | ||
231 | */ | ||
232 | void cgroup_rstat_flush_hold(struct cgroup *cgrp) | ||
233 | __acquires(&cgroup_rstat_lock) | ||
234 | { | ||
235 | might_sleep(); | ||
236 | spin_lock_irq(&cgroup_rstat_lock); | ||
237 | cgroup_rstat_flush_locked(cgrp, true); | ||
238 | } | ||
239 | |||
240 | /** | ||
241 | * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() | ||
242 | */ | ||
243 | void cgroup_rstat_flush_release(void) | ||
244 | __releases(&cgroup_rstat_lock) | ||
245 | { | ||
246 | spin_unlock_irq(&cgroup_rstat_lock); | ||
247 | } | ||
248 | |||
249 | int cgroup_rstat_init(struct cgroup *cgrp) | ||
250 | { | ||
251 | int cpu; | ||
252 | |||
253 | /* the root cgrp has rstat_cpu preallocated */ | ||
254 | if (!cgrp->rstat_cpu) { | ||
255 | cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); | ||
256 | if (!cgrp->rstat_cpu) | ||
257 | return -ENOMEM; | ||
258 | } | ||
259 | |||
260 | /* ->updated_children list is self terminated */ | ||
261 | for_each_possible_cpu(cpu) { | ||
262 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); | ||
263 | |||
264 | rstatc->updated_children = cgrp; | ||
265 | u64_stats_init(&rstatc->bsync); | ||
266 | } | ||
267 | |||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | void cgroup_rstat_exit(struct cgroup *cgrp) | ||
272 | { | ||
273 | int cpu; | ||
274 | |||
275 | cgroup_rstat_flush(cgrp); | ||
276 | |||
277 | /* sanity check */ | ||
278 | for_each_possible_cpu(cpu) { | ||
279 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); | ||
280 | |||
281 | if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || | ||
282 | WARN_ON_ONCE(rstatc->updated_next)) | ||
283 | return; | ||
284 | } | ||
285 | |||
286 | free_percpu(cgrp->rstat_cpu); | ||
287 | cgrp->rstat_cpu = NULL; | ||
288 | } | ||
289 | |||
290 | void __init cgroup_rstat_boot(void) | ||
291 | { | ||
292 | int cpu; | ||
293 | |||
294 | for_each_possible_cpu(cpu) | ||
295 | raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); | ||
296 | |||
297 | BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * Functions for cgroup basic resource statistics implemented on top of | ||
302 | * rstat. | ||
303 | */ | ||
304 | static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, | ||
305 | struct cgroup_base_stat *src_bstat) | ||
306 | { | ||
307 | dst_bstat->cputime.utime += src_bstat->cputime.utime; | ||
308 | dst_bstat->cputime.stime += src_bstat->cputime.stime; | ||
309 | dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; | ||
310 | } | ||
311 | |||
312 | static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) | ||
313 | { | ||
314 | struct cgroup *parent = cgroup_parent(cgrp); | ||
315 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); | ||
316 | struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; | ||
317 | struct task_cputime cputime; | ||
318 | struct cgroup_base_stat delta; | ||
319 | unsigned seq; | ||
320 | |||
321 | /* fetch the current per-cpu values */ | ||
322 | do { | ||
323 | seq = __u64_stats_fetch_begin(&rstatc->bsync); | ||
324 | cputime = rstatc->bstat.cputime; | ||
325 | } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); | ||
326 | |||
327 | /* calculate the delta to propagate */ | ||
328 | delta.cputime.utime = cputime.utime - last_cputime->utime; | ||
329 | delta.cputime.stime = cputime.stime - last_cputime->stime; | ||
330 | delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - | ||
331 | last_cputime->sum_exec_runtime; | ||
332 | *last_cputime = cputime; | ||
333 | |||
334 | /* transfer the pending stat into delta */ | ||
335 | cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); | ||
336 | memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); | ||
337 | |||
338 | /* propagate delta into the global stat and the parent's pending */ | ||
339 | cgroup_base_stat_accumulate(&cgrp->bstat, &delta); | ||
340 | if (parent) | ||
341 | cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); | ||
342 | } | ||
343 | |||
344 | static struct cgroup_rstat_cpu * | ||
345 | cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) | ||
346 | { | ||
347 | struct cgroup_rstat_cpu *rstatc; | ||
348 | |||
349 | rstatc = get_cpu_ptr(cgrp->rstat_cpu); | ||
350 | u64_stats_update_begin(&rstatc->bsync); | ||
351 | return rstatc; | ||
352 | } | ||
353 | |||
354 | static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, | ||
355 | struct cgroup_rstat_cpu *rstatc) | ||
356 | { | ||
357 | u64_stats_update_end(&rstatc->bsync); | ||
358 | cgroup_rstat_updated(cgrp, smp_processor_id()); | ||
359 | put_cpu_ptr(rstatc); | ||
360 | } | ||
361 | |||
362 | void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) | ||
363 | { | ||
364 | struct cgroup_rstat_cpu *rstatc; | ||
365 | |||
366 | rstatc = cgroup_base_stat_cputime_account_begin(cgrp); | ||
367 | rstatc->bstat.cputime.sum_exec_runtime += delta_exec; | ||
368 | cgroup_base_stat_cputime_account_end(cgrp, rstatc); | ||
369 | } | ||
370 | |||
371 | void __cgroup_account_cputime_field(struct cgroup *cgrp, | ||
372 | enum cpu_usage_stat index, u64 delta_exec) | ||
373 | { | ||
374 | struct cgroup_rstat_cpu *rstatc; | ||
375 | |||
376 | rstatc = cgroup_base_stat_cputime_account_begin(cgrp); | ||
377 | |||
378 | switch (index) { | ||
379 | case CPUTIME_USER: | ||
380 | case CPUTIME_NICE: | ||
381 | rstatc->bstat.cputime.utime += delta_exec; | ||
382 | break; | ||
383 | case CPUTIME_SYSTEM: | ||
384 | case CPUTIME_IRQ: | ||
385 | case CPUTIME_SOFTIRQ: | ||
386 | rstatc->bstat.cputime.stime += delta_exec; | ||
387 | break; | ||
388 | default: | ||
389 | break; | ||
390 | } | ||
391 | |||
392 | cgroup_base_stat_cputime_account_end(cgrp, rstatc); | ||
393 | } | ||
394 | |||
395 | void cgroup_base_stat_cputime_show(struct seq_file *seq) | ||
396 | { | ||
397 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
398 | u64 usage, utime, stime; | ||
399 | |||
400 | if (!cgroup_parent(cgrp)) | ||
401 | return; | ||
402 | |||
403 | cgroup_rstat_flush_hold(cgrp); | ||
404 | usage = cgrp->bstat.cputime.sum_exec_runtime; | ||
405 | cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); | ||
406 | cgroup_rstat_flush_release(); | ||
407 | |||
408 | do_div(usage, NSEC_PER_USEC); | ||
409 | do_div(utime, NSEC_PER_USEC); | ||
410 | do_div(stime, NSEC_PER_USEC); | ||
411 | |||
412 | seq_printf(seq, "usage_usec %llu\n" | ||
413 | "user_usec %llu\n" | ||
414 | "system_usec %llu\n", | ||
415 | usage, utime, stime); | ||
416 | } | ||
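cgroup_base_stat_flush() above turns per-cpu counters into hierarchy-wide ones with a snapshot-and-delta scheme: compute the delta against last_bstat, fold in whatever the children left in pending_bstat, add the result to this cgroup's bstat and push it into the parent's pending_bstat. A compact userspace model of that arithmetic follows, reduced to a single stat on a single cpu; the names are invented and all locking is omitted.

#include <stdio.h>

struct node {
        struct node *parent;
        unsigned long long cur;         /* per-cpu counter (hot path) */
        unsigned long long last;        /* snapshot at the previous flush */
        unsigned long long pending;     /* deltas pushed up by children */
        unsigned long long total;       /* flushed, subtree-wide value */
};

/* mirrors cgroup_base_stat_flush() for one node on one cpu */
static void flush_one(struct node *n)
{
        unsigned long long delta = n->cur - n->last;

        n->last = n->cur;
        delta += n->pending;            /* transfer the pending stat into delta */
        n->pending = 0;
        n->total += delta;              /* propagate into the global stat ... */
        if (n->parent)
                n->parent->pending += delta;    /* ... and the parent's pending */
}

int main(void)
{
        struct node root = { 0 }, child = { .parent = &root };

        child.cur += 5;                 /* hot-path updates */
        root.cur += 2;

        /* flush children before parents, as the updated tree guarantees */
        flush_one(&child);
        flush_one(&root);

        printf("child=%llu root=%llu\n", child.total, root.total);     /* 5 and 7 */
        return 0;
}

Because cgroup_rstat_cpu_pop_updated() always visits a child before its parent, the child's delta has already landed in the parent's pending counter by the time the parent itself is flushed.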
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
deleted file mode 100644
index 1e111dd455c4..000000000000
--- a/kernel/cgroup/stat.c
+++ /dev/null
@@ -1,338 +0,0 @@ | |||
1 | #include "cgroup-internal.h" | ||
2 | |||
3 | #include <linux/sched/cputime.h> | ||
4 | |||
5 | static DEFINE_MUTEX(cgroup_stat_mutex); | ||
6 | static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); | ||
7 | |||
8 | static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) | ||
9 | { | ||
10 | return per_cpu_ptr(cgrp->cpu_stat, cpu); | ||
11 | } | ||
12 | |||
13 | /** | ||
14 | * cgroup_cpu_stat_updated - keep track of updated cpu_stat | ||
15 | * @cgrp: target cgroup | ||
16 | * @cpu: cpu on which cpu_stat was updated | ||
17 | * | ||
18 | * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching | ||
19 | * cpu_stat->updated_children list. See the comment on top of | ||
20 | * cgroup_cpu_stat definition for details. | ||
21 | */ | ||
22 | static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) | ||
23 | { | ||
24 | raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); | ||
25 | struct cgroup *parent; | ||
26 | unsigned long flags; | ||
27 | |||
28 | /* | ||
29 | * Speculative already-on-list test. This may race leading to | ||
30 | * temporary inaccuracies, which is fine. | ||
31 | * | ||
32 | * Because @parent's updated_children is terminated with @parent | ||
33 | * instead of NULL, we can tell whether @cgrp is on the list by | ||
34 | * testing the next pointer for NULL. | ||
35 | */ | ||
36 | if (cgroup_cpu_stat(cgrp, cpu)->updated_next) | ||
37 | return; | ||
38 | |||
39 | raw_spin_lock_irqsave(cpu_lock, flags); | ||
40 | |||
41 | /* put @cgrp and all ancestors on the corresponding updated lists */ | ||
42 | for (parent = cgroup_parent(cgrp); parent; | ||
43 | cgrp = parent, parent = cgroup_parent(cgrp)) { | ||
44 | struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); | ||
45 | struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); | ||
46 | |||
47 | /* | ||
48 | * Both additions and removals are bottom-up. If a cgroup | ||
49 | * is already in the tree, all ancestors are. | ||
50 | */ | ||
51 | if (cstat->updated_next) | ||
52 | break; | ||
53 | |||
54 | cstat->updated_next = pcstat->updated_children; | ||
55 | pcstat->updated_children = cgrp; | ||
56 | } | ||
57 | |||
58 | raw_spin_unlock_irqrestore(cpu_lock, flags); | ||
59 | } | ||
60 | |||
61 | /** | ||
62 | * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree | ||
63 | * @pos: current position | ||
64 | * @root: root of the tree to traversal | ||
65 | * @cpu: target cpu | ||
66 | * | ||
67 | * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts | ||
68 | * the traversal and %NULL return indicates the end. During traversal, | ||
69 | * each returned cgroup is unlinked from the tree. Must be called with the | ||
70 | * matching cgroup_cpu_stat_lock held. | ||
71 | * | ||
72 | * The only ordering guarantee is that, for a parent and a child pair | ||
73 | * covered by a given traversal, if a child is visited, its parent is | ||
74 | * guaranteed to be visited afterwards. | ||
75 | */ | ||
76 | static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, | ||
77 | struct cgroup *root, int cpu) | ||
78 | { | ||
79 | struct cgroup_cpu_stat *cstat; | ||
80 | struct cgroup *parent; | ||
81 | |||
82 | if (pos == root) | ||
83 | return NULL; | ||
84 | |||
85 | /* | ||
86 | * We're gonna walk down to the first leaf and visit/remove it. We | ||
87 | * can pick whatever unvisited node as the starting point. | ||
88 | */ | ||
89 | if (!pos) | ||
90 | pos = root; | ||
91 | else | ||
92 | pos = cgroup_parent(pos); | ||
93 | |||
94 | /* walk down to the first leaf */ | ||
95 | while (true) { | ||
96 | cstat = cgroup_cpu_stat(pos, cpu); | ||
97 | if (cstat->updated_children == pos) | ||
98 | break; | ||
99 | pos = cstat->updated_children; | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Unlink @pos from the tree. As the updated_children list is | ||
104 | * singly linked, we have to walk it to find the removal point. | ||
105 | * However, due to the way we traverse, @pos will be the first | ||
106 | * child in most cases. The only exception is @root. | ||
107 | */ | ||
108 | parent = cgroup_parent(pos); | ||
109 | if (parent && cstat->updated_next) { | ||
110 | struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); | ||
111 | struct cgroup_cpu_stat *ncstat; | ||
112 | struct cgroup **nextp; | ||
113 | |||
114 | nextp = &pcstat->updated_children; | ||
115 | while (true) { | ||
116 | ncstat = cgroup_cpu_stat(*nextp, cpu); | ||
117 | if (*nextp == pos) | ||
118 | break; | ||
119 | |||
120 | WARN_ON_ONCE(*nextp == parent); | ||
121 | nextp = &ncstat->updated_next; | ||
122 | } | ||
123 | |||
124 | *nextp = cstat->updated_next; | ||
125 | cstat->updated_next = NULL; | ||
126 | } | ||
127 | |||
128 | return pos; | ||
129 | } | ||
130 | |||
131 | static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, | ||
132 | struct cgroup_stat *src_stat) | ||
133 | { | ||
134 | dst_stat->cputime.utime += src_stat->cputime.utime; | ||
135 | dst_stat->cputime.stime += src_stat->cputime.stime; | ||
136 | dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; | ||
137 | } | ||
138 | |||
139 | static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) | ||
140 | { | ||
141 | struct cgroup *parent = cgroup_parent(cgrp); | ||
142 | struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); | ||
143 | struct task_cputime *last_cputime = &cstat->last_cputime; | ||
144 | struct task_cputime cputime; | ||
145 | struct cgroup_stat delta; | ||
146 | unsigned seq; | ||
147 | |||
148 | lockdep_assert_held(&cgroup_stat_mutex); | ||
149 | |||
150 | /* fetch the current per-cpu values */ | ||
151 | do { | ||
152 | seq = __u64_stats_fetch_begin(&cstat->sync); | ||
153 | cputime = cstat->cputime; | ||
154 | } while (__u64_stats_fetch_retry(&cstat->sync, seq)); | ||
155 | |||
156 | /* accumulate the deltas to propgate */ | ||
157 | delta.cputime.utime = cputime.utime - last_cputime->utime; | ||
158 | delta.cputime.stime = cputime.stime - last_cputime->stime; | ||
159 | delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - | ||
160 | last_cputime->sum_exec_runtime; | ||
161 | *last_cputime = cputime; | ||
162 | |||
163 | /* transfer the pending stat into delta */ | ||
164 | cgroup_stat_accumulate(&delta, &cgrp->pending_stat); | ||
165 | memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); | ||
166 | |||
167 | /* propagate delta into the global stat and the parent's pending */ | ||
168 | cgroup_stat_accumulate(&cgrp->stat, &delta); | ||
169 | if (parent) | ||
170 | cgroup_stat_accumulate(&parent->pending_stat, &delta); | ||
171 | } | ||
172 | |||
173 | /* see cgroup_stat_flush() */ | ||
174 | static void cgroup_stat_flush_locked(struct cgroup *cgrp) | ||
175 | { | ||
176 | int cpu; | ||
177 | |||
178 | lockdep_assert_held(&cgroup_stat_mutex); | ||
179 | |||
180 | for_each_possible_cpu(cpu) { | ||
181 | raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); | ||
182 | struct cgroup *pos = NULL; | ||
183 | |||
184 | raw_spin_lock_irq(cpu_lock); | ||
185 | while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) | ||
186 | cgroup_cpu_stat_flush_one(pos, cpu); | ||
187 | raw_spin_unlock_irq(cpu_lock); | ||
188 | } | ||
189 | } | ||
190 | |||
191 | /** | ||
192 | * cgroup_stat_flush - flush stats in @cgrp's subtree | ||
193 | * @cgrp: target cgroup | ||
194 | * | ||
195 | * Collect all per-cpu stats in @cgrp's subtree into the global counters | ||
196 | * and propagate them upwards. After this function returns, all cgroups in | ||
197 | * the subtree have up-to-date ->stat. | ||
198 | * | ||
199 | * This also gets all cgroups in the subtree including @cgrp off the | ||
200 | * ->updated_children lists. | ||
201 | */ | ||
202 | void cgroup_stat_flush(struct cgroup *cgrp) | ||
203 | { | ||
204 | mutex_lock(&cgroup_stat_mutex); | ||
205 | cgroup_stat_flush_locked(cgrp); | ||
206 | mutex_unlock(&cgroup_stat_mutex); | ||
207 | } | ||
208 | |||
209 | static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) | ||
210 | { | ||
211 | struct cgroup_cpu_stat *cstat; | ||
212 | |||
213 | cstat = get_cpu_ptr(cgrp->cpu_stat); | ||
214 | u64_stats_update_begin(&cstat->sync); | ||
215 | return cstat; | ||
216 | } | ||
217 | |||
218 | static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, | ||
219 | struct cgroup_cpu_stat *cstat) | ||
220 | { | ||
221 | u64_stats_update_end(&cstat->sync); | ||
222 | cgroup_cpu_stat_updated(cgrp, smp_processor_id()); | ||
223 | put_cpu_ptr(cstat); | ||
224 | } | ||
225 | |||
226 | void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) | ||
227 | { | ||
228 | struct cgroup_cpu_stat *cstat; | ||
229 | |||
230 | cstat = cgroup_cpu_stat_account_begin(cgrp); | ||
231 | cstat->cputime.sum_exec_runtime += delta_exec; | ||
232 | cgroup_cpu_stat_account_end(cgrp, cstat); | ||
233 | } | ||
234 | |||
235 | void __cgroup_account_cputime_field(struct cgroup *cgrp, | ||
236 | enum cpu_usage_stat index, u64 delta_exec) | ||
237 | { | ||
238 | struct cgroup_cpu_stat *cstat; | ||
239 | |||
240 | cstat = cgroup_cpu_stat_account_begin(cgrp); | ||
241 | |||
242 | switch (index) { | ||
243 | case CPUTIME_USER: | ||
244 | case CPUTIME_NICE: | ||
245 | cstat->cputime.utime += delta_exec; | ||
246 | break; | ||
247 | case CPUTIME_SYSTEM: | ||
248 | case CPUTIME_IRQ: | ||
249 | case CPUTIME_SOFTIRQ: | ||
250 | cstat->cputime.stime += delta_exec; | ||
251 | break; | ||
252 | default: | ||
253 | break; | ||
254 | } | ||
255 | |||
256 | cgroup_cpu_stat_account_end(cgrp, cstat); | ||
257 | } | ||
258 | |||
259 | void cgroup_stat_show_cputime(struct seq_file *seq) | ||
260 | { | ||
261 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
262 | u64 usage, utime, stime; | ||
263 | |||
264 | if (!cgroup_parent(cgrp)) | ||
265 | return; | ||
266 | |||
267 | mutex_lock(&cgroup_stat_mutex); | ||
268 | |||
269 | cgroup_stat_flush_locked(cgrp); | ||
270 | |||
271 | usage = cgrp->stat.cputime.sum_exec_runtime; | ||
272 | cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, | ||
273 | &utime, &stime); | ||
274 | |||
275 | mutex_unlock(&cgroup_stat_mutex); | ||
276 | |||
277 | do_div(usage, NSEC_PER_USEC); | ||
278 | do_div(utime, NSEC_PER_USEC); | ||
279 | do_div(stime, NSEC_PER_USEC); | ||
280 | |||
281 | seq_printf(seq, "usage_usec %llu\n" | ||
282 | "user_usec %llu\n" | ||
283 | "system_usec %llu\n", | ||
284 | usage, utime, stime); | ||
285 | } | ||
286 | |||
287 | int cgroup_stat_init(struct cgroup *cgrp) | ||
288 | { | ||
289 | int cpu; | ||
290 | |||
291 | /* the root cgrp has cpu_stat preallocated */ | ||
292 | if (!cgrp->cpu_stat) { | ||
293 | cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); | ||
294 | if (!cgrp->cpu_stat) | ||
295 | return -ENOMEM; | ||
296 | } | ||
297 | |||
298 | /* ->updated_children list is self terminated */ | ||
299 | for_each_possible_cpu(cpu) { | ||
300 | struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); | ||
301 | |||
302 | cstat->updated_children = cgrp; | ||
303 | u64_stats_init(&cstat->sync); | ||
304 | } | ||
305 | |||
306 | prev_cputime_init(&cgrp->stat.prev_cputime); | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | void cgroup_stat_exit(struct cgroup *cgrp) | ||
312 | { | ||
313 | int cpu; | ||
314 | |||
315 | cgroup_stat_flush(cgrp); | ||
316 | |||
317 | /* sanity check */ | ||
318 | for_each_possible_cpu(cpu) { | ||
319 | struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); | ||
320 | |||
321 | if (WARN_ON_ONCE(cstat->updated_children != cgrp) || | ||
322 | WARN_ON_ONCE(cstat->updated_next)) | ||
323 | return; | ||
324 | } | ||
325 | |||
326 | free_percpu(cgrp->cpu_stat); | ||
327 | cgrp->cpu_stat = NULL; | ||
328 | } | ||
329 | |||
330 | void __init cgroup_stat_boot(void) | ||
331 | { | ||
332 | int cpu; | ||
333 | |||
334 | for_each_possible_cpu(cpu) | ||
335 | raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); | ||
336 | |||
337 | BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); | ||
338 | } | ||