aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/accounting/psi.txt9
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst18
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cgroup.h15
-rw-r--r--include/linux/psi.h25
-rw-r--r--init/Kconfig4
-rw-r--r--kernel/cgroup/cgroup.c45
-rw-r--r--kernel/sched/psi.c118
8 files changed, 228 insertions, 10 deletions
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
index 3753a82f1cf5..b8ca28b60215 100644
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is
62tracked and exported as well, to allow detection of latency spikes 62tracked and exported as well, to allow detection of latency spikes
63which wouldn't necessarily make a dent in the time averages, or to 63which wouldn't necessarily make a dent in the time averages, or to
64average trends over custom time frames. 64average trends over custom time frames.
65
66Cgroup2 interface
67=================
68
69In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
70mounted, pressure stall information is also tracked for tasks grouped
71into cgroups. Each subdirectory in the cgroupfs mountpoint contains
72cpu.pressure, memory.pressure, and io.pressure files; the format is
73the same as the /proc/pressure/ files.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index caf36105a1c7..8389d6f72a77 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -966,6 +966,12 @@ All time durations are in microseconds.
966 $PERIOD duration. "max" for $MAX indicates no limit. If only 966 $PERIOD duration. "max" for $MAX indicates no limit. If only
967 one number is written, $MAX is updated. 967 one number is written, $MAX is updated.
968 968
969 cpu.pressure
970 A read-only nested-key file which exists on non-root cgroups.
971
972 Shows pressure stall information for CPU. See
973 Documentation/accounting/psi.txt for details.
974
969 975
970Memory 976Memory
971------ 977------
@@ -1271,6 +1277,12 @@ PAGE_SIZE multiple when read back.
1271 higher than the limit for an extended period of time. This 1277 higher than the limit for an extended period of time. This
1272 reduces the impact on the workload and memory management. 1278 reduces the impact on the workload and memory management.
1273 1279
1280 memory.pressure
1281 A read-only nested-key file which exists on non-root cgroups.
1282
1283 Shows pressure stall information for memory. See
1284 Documentation/accounting/psi.txt for details.
1285
1274 1286
1275Usage Guidelines 1287Usage Guidelines
1276~~~~~~~~~~~~~~~~ 1288~~~~~~~~~~~~~~~~
@@ -1408,6 +1420,12 @@ IO Interface Files
1408 1420
1409 8:16 rbps=2097152 wbps=max riops=max wiops=max 1421 8:16 rbps=2097152 wbps=max riops=max wiops=max
1410 1422
1423 io.pressure
1424 A read-only nested-key file which exists on non-root cgroups.
1425
1426 Shows pressure stall information for IO. See
1427 Documentation/accounting/psi.txt for details.
1428
1411 1429
1412Writeback 1430Writeback
1413~~~~~~~~~ 1431~~~~~~~~~
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 22254c1fe1c5..5e1694fe035b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@
20#include <linux/u64_stats_sync.h> 20#include <linux/u64_stats_sync.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <linux/bpf-cgroup.h> 22#include <linux/bpf-cgroup.h>
23#include <linux/psi_types.h>
23 24
24#ifdef CONFIG_CGROUPS 25#ifdef CONFIG_CGROUPS
25 26
@@ -436,6 +437,9 @@ struct cgroup {
436 /* used to schedule release agent */ 437 /* used to schedule release agent */
437 struct work_struct release_agent_work; 438 struct work_struct release_agent_work;
438 439
440 /* used to track pressure stalls */
441 struct psi_group psi;
442
439 /* used to store eBPF programs */ 443 /* used to store eBPF programs */
440 struct cgroup_bpf bpf; 444 struct cgroup_bpf bpf;
441 445
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b622d6608605..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
650 pr_cont_kernfs_path(cgrp->kn); 650 pr_cont_kernfs_path(cgrp->kn);
651} 651}
652 652
653static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
654{
655 return &cgrp->psi;
656}
657
653static inline void cgroup_init_kthreadd(void) 658static inline void cgroup_init_kthreadd(void)
654{ 659{
655 /* 660 /*
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
703 return NULL; 708 return NULL;
704} 709}
705 710
711static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
712{
713 return NULL;
714}
715
716static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
717{
718 return NULL;
719}
720
706static inline bool task_under_cgroup_hierarchy(struct task_struct *task, 721static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
707 struct cgroup *ancestor) 722 struct cgroup *ancestor)
708{ 723{
diff --git a/include/linux/psi.h b/include/linux/psi.h
index b0daf050de58..8e0725aac0aa 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,9 @@
4#include <linux/psi_types.h> 4#include <linux/psi_types.h>
5#include <linux/sched.h> 5#include <linux/sched.h>
6 6
7struct seq_file;
8struct css_set;
9
7#ifdef CONFIG_PSI 10#ifdef CONFIG_PSI
8 11
9extern bool psi_disabled; 12extern bool psi_disabled;
@@ -16,6 +19,14 @@ void psi_memstall_tick(struct task_struct *task, int cpu);
16void psi_memstall_enter(unsigned long *flags); 19void psi_memstall_enter(unsigned long *flags);
17void psi_memstall_leave(unsigned long *flags); 20void psi_memstall_leave(unsigned long *flags);
18 21
22int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
23
24#ifdef CONFIG_CGROUPS
25int psi_cgroup_alloc(struct cgroup *cgrp);
26void psi_cgroup_free(struct cgroup *cgrp);
27void cgroup_move_task(struct task_struct *p, struct css_set *to);
28#endif
29
19#else /* CONFIG_PSI */ 30#else /* CONFIG_PSI */
20 31
21static inline void psi_init(void) {} 32static inline void psi_init(void) {}
@@ -23,6 +34,20 @@ static inline void psi_init(void) {}
23static inline void psi_memstall_enter(unsigned long *flags) {} 34static inline void psi_memstall_enter(unsigned long *flags) {}
24static inline void psi_memstall_leave(unsigned long *flags) {} 35static inline void psi_memstall_leave(unsigned long *flags) {}
25 36
37#ifdef CONFIG_CGROUPS
38static inline int psi_cgroup_alloc(struct cgroup *cgrp)
39{
40 return 0;
41}
42static inline void psi_cgroup_free(struct cgroup *cgrp)
43{
44}
45static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
46{
47 rcu_assign_pointer(p->cgroups, to);
48}
49#endif
50
26#endif /* CONFIG_PSI */ 51#endif /* CONFIG_PSI */
27 52
28#endif /* _LINUX_PSI_H */ 53#endif /* _LINUX_PSI_H */
diff --git a/init/Kconfig b/init/Kconfig
index 26e639df5517..a4112e95724a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -501,6 +501,10 @@ config PSI
501 the share of walltime in which some or all tasks in the system are 501 the share of walltime in which some or all tasks in the system are
502 delayed due to contention of the respective resource. 502 delayed due to contention of the respective resource.
503 503
504 In kernels with cgroup support, cgroups (cgroup2 only) will
505 have cpu.pressure, memory.pressure, and io.pressure files,
506 which aggregate pressure stalls for the grouped tasks only.
507
504 For more details see Documentation/accounting/psi.txt. 508 For more details see Documentation/accounting/psi.txt.
505 509
506 Say N if unsure. 510 Say N if unsure.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4c1cf0969a80..8b79318810ad 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
55#include <linux/nsproxy.h> 55#include <linux/nsproxy.h>
56#include <linux/file.h> 56#include <linux/file.h>
57#include <linux/sched/cputime.h> 57#include <linux/sched/cputime.h>
58#include <linux/psi.h>
58#include <net/sock.h> 59#include <net/sock.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
@@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task,
862 */ 863 */
863 WARN_ON_ONCE(task->flags & PF_EXITING); 864 WARN_ON_ONCE(task->flags & PF_EXITING);
864 865
865 rcu_assign_pointer(task->cgroups, to_cset); 866 cgroup_move_task(task, to_cset);
866 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : 867 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
867 &to_cset->tasks); 868 &to_cset->tasks);
868 } 869 }
@@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
3446 return ret; 3447 return ret;
3447} 3448}
3448 3449
3450#ifdef CONFIG_PSI
3451static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3452{
3453 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
3454}
3455static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3456{
3457 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
3458}
3459static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3460{
3461 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
3462}
3463#endif
3464
3449static int cgroup_file_open(struct kernfs_open_file *of) 3465static int cgroup_file_open(struct kernfs_open_file *of)
3450{ 3466{
3451 struct cftype *cft = of->kn->priv; 3467 struct cftype *cft = of->kn->priv;
@@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = {
4576 .flags = CFTYPE_NOT_ON_ROOT, 4592 .flags = CFTYPE_NOT_ON_ROOT,
4577 .seq_show = cpu_stat_show, 4593 .seq_show = cpu_stat_show,
4578 }, 4594 },
4595#ifdef CONFIG_PSI
4596 {
4597 .name = "io.pressure",
4598 .flags = CFTYPE_NOT_ON_ROOT,
4599 .seq_show = cgroup_io_pressure_show,
4600 },
4601 {
4602 .name = "memory.pressure",
4603 .flags = CFTYPE_NOT_ON_ROOT,
4604 .seq_show = cgroup_memory_pressure_show,
4605 },
4606 {
4607 .name = "cpu.pressure",
4608 .flags = CFTYPE_NOT_ON_ROOT,
4609 .seq_show = cgroup_cpu_pressure_show,
4610 },
4611#endif
4579 { } /* terminate */ 4612 { } /* terminate */
4580}; 4613};
4581 4614
@@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work)
4636 */ 4669 */
4637 cgroup_put(cgroup_parent(cgrp)); 4670 cgroup_put(cgroup_parent(cgrp));
4638 kernfs_put(cgrp->kn); 4671 kernfs_put(cgrp->kn);
4672 psi_cgroup_free(cgrp);
4639 if (cgroup_on_dfl(cgrp)) 4673 if (cgroup_on_dfl(cgrp))
4640 cgroup_rstat_exit(cgrp); 4674 cgroup_rstat_exit(cgrp);
4641 kfree(cgrp); 4675 kfree(cgrp);
@@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4892 cgrp->self.parent = &parent->self; 4926 cgrp->self.parent = &parent->self;
4893 cgrp->root = root; 4927 cgrp->root = root;
4894 cgrp->level = level; 4928 cgrp->level = level;
4895 ret = cgroup_bpf_inherit(cgrp); 4929
4930 ret = psi_cgroup_alloc(cgrp);
4896 if (ret) 4931 if (ret)
4897 goto out_idr_free; 4932 goto out_idr_free;
4898 4933
4934 ret = cgroup_bpf_inherit(cgrp);
4935 if (ret)
4936 goto out_psi_free;
4937
4899 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { 4938 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
4900 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; 4939 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4901 4940
@@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
4933 4972
4934 return cgrp; 4973 return cgrp;
4935 4974
4975out_psi_free:
4976 psi_cgroup_free(cgrp);
4936out_idr_free: 4977out_idr_free:
4937 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4978 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4938out_stat_exit: 4979out_stat_exit:
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 595414599b98..7cdecfc010af 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu,
473 schedule_delayed_work(&group->clock_work, PSI_FREQ); 473 schedule_delayed_work(&group->clock_work, PSI_FREQ);
474} 474}
475 475
476static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
477{
478#ifdef CONFIG_CGROUPS
479 struct cgroup *cgroup = NULL;
480
481 if (!*iter)
482 cgroup = task->cgroups->dfl_cgrp;
483 else if (*iter == &psi_system)
484 return NULL;
485 else
486 cgroup = cgroup_parent(*iter);
487
488 if (cgroup && cgroup_parent(cgroup)) {
489 *iter = cgroup;
490 return cgroup_psi(cgroup);
491 }
492#else
493 if (*iter)
494 return NULL;
495#endif
496 *iter = &psi_system;
497 return &psi_system;
498}
499
476void psi_task_change(struct task_struct *task, int clear, int set) 500void psi_task_change(struct task_struct *task, int clear, int set)
477{ 501{
478 int cpu = task_cpu(task); 502 int cpu = task_cpu(task);
503 struct psi_group *group;
504 void *iter = NULL;
479 505
480 if (!task->pid) 506 if (!task->pid)
481 return; 507 return;
@@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set)
492 task->psi_flags &= ~clear; 518 task->psi_flags &= ~clear;
493 task->psi_flags |= set; 519 task->psi_flags |= set;
494 520
495 psi_group_change(&psi_system, cpu, clear, set); 521 while ((group = iterate_groups(task, &iter)))
522 psi_group_change(group, cpu, clear, set);
496} 523}
497 524
498void psi_memstall_tick(struct task_struct *task, int cpu) 525void psi_memstall_tick(struct task_struct *task, int cpu)
499{ 526{
500 struct psi_group_cpu *groupc; 527 struct psi_group *group;
528 void *iter = NULL;
501 529
502 groupc = per_cpu_ptr(psi_system.pcpu, cpu); 530 while ((group = iterate_groups(task, &iter))) {
503 write_seqcount_begin(&groupc->seq); 531 struct psi_group_cpu *groupc;
504 record_times(groupc, cpu, true); 532
505 write_seqcount_end(&groupc->seq); 533 groupc = per_cpu_ptr(group->pcpu, cpu);
534 write_seqcount_begin(&groupc->seq);
535 record_times(groupc, cpu, true);
536 write_seqcount_end(&groupc->seq);
537 }
506} 538}
507 539
508/** 540/**
@@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags)
565 rq_unlock_irq(rq, &rf); 597 rq_unlock_irq(rq, &rf);
566} 598}
567 599
568static int psi_show(struct seq_file *m, struct psi_group *group, 600#ifdef CONFIG_CGROUPS
569 enum psi_res res) 601int psi_cgroup_alloc(struct cgroup *cgroup)
602{
603 if (psi_disabled)
604 return 0;
605
606 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
607 if (!cgroup->psi.pcpu)
608 return -ENOMEM;
609 group_init(&cgroup->psi);
610 return 0;
611}
612
613void psi_cgroup_free(struct cgroup *cgroup)
614{
615 if (psi_disabled)
616 return;
617
618 cancel_delayed_work_sync(&cgroup->psi.clock_work);
619 free_percpu(cgroup->psi.pcpu);
620}
621
622/**
623 * cgroup_move_task - move task to a different cgroup
624 * @task: the task
625 * @to: the target css_set
626 *
627 * Move task to a new cgroup and safely migrate its associated stall
628 * state between the different groups.
629 *
630 * This function acquires the task's rq lock to lock out concurrent
631 * changes to the task's scheduling state and - in case the task is
632 * running - concurrent changes to its stall state.
633 */
634void cgroup_move_task(struct task_struct *task, struct css_set *to)
635{
636 bool move_psi = !psi_disabled;
637 unsigned int task_flags = 0;
638 struct rq_flags rf;
639 struct rq *rq;
640
641 if (move_psi) {
642 rq = task_rq_lock(task, &rf);
643
644 if (task_on_rq_queued(task))
645 task_flags = TSK_RUNNING;
646 else if (task->in_iowait)
647 task_flags = TSK_IOWAIT;
648
649 if (task->flags & PF_MEMSTALL)
650 task_flags |= TSK_MEMSTALL;
651
652 if (task_flags)
653 psi_task_change(task, task_flags, 0);
654 }
655
656 /*
657 * Lame to do this here, but the scheduler cannot be locked
658 * from the outside, so we move cgroups from inside sched/.
659 */
660 rcu_assign_pointer(task->cgroups, to);
661
662 if (move_psi) {
663 if (task_flags)
664 psi_task_change(task, 0, task_flags);
665
666 task_rq_unlock(rq, task, &rf);
667 }
668}
669#endif /* CONFIG_CGROUPS */
670
671int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
570{ 672{
571 int full; 673 int full;
572 674