diff options
-rw-r--r-- | Documentation/accounting/psi.txt | 9 | ||||
-rw-r--r-- | Documentation/admin-guide/cgroup-v2.rst | 18 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 4 | ||||
-rw-r--r-- | include/linux/cgroup.h | 15 | ||||
-rw-r--r-- | include/linux/psi.h | 25 | ||||
-rw-r--r-- | init/Kconfig | 4 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 45 | ||||
-rw-r--r-- | kernel/sched/psi.c | 118 |
8 files changed, 228 insertions, 10 deletions
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 3753a82f1cf5..b8ca28b60215 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt | |||
@@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is | |||
62 | tracked and exported as well, to allow detection of latency spikes | 62 | tracked and exported as well, to allow detection of latency spikes |
63 | which wouldn't necessarily make a dent in the time averages, or to | 63 | which wouldn't necessarily make a dent in the time averages, or to |
64 | average trends over custom time frames. | 64 | average trends over custom time frames. |
65 | |||
66 | Cgroup2 interface | ||
67 | ================= | ||
68 | |||
69 | In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem | ||
70 | mounted, pressure stall information is also tracked for tasks grouped | ||
71 | into cgroups. Each subdirectory in the cgroupfs mountpoint contains | ||
72 | cpu.pressure, memory.pressure, and io.pressure files; the format is | ||
73 | the same as the /proc/pressure/ files. | ||
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index caf36105a1c7..8389d6f72a77 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst | |||
@@ -966,6 +966,12 @@ All time durations are in microseconds. | |||
966 | $PERIOD duration. "max" for $MAX indicates no limit. If only | 966 | $PERIOD duration. "max" for $MAX indicates no limit. If only |
967 | one number is written, $MAX is updated. | 967 | one number is written, $MAX is updated. |
968 | 968 | ||
969 | cpu.pressure | ||
970 | A read-only nested-key file which exists on non-root cgroups. | ||
971 | |||
972 | Shows pressure stall information for CPU. See | ||
973 | Documentation/accounting/psi.txt for details. | ||
974 | |||
969 | 975 | ||
970 | Memory | 976 | Memory |
971 | ------ | 977 | ------ |
@@ -1271,6 +1277,12 @@ PAGE_SIZE multiple when read back. | |||
1271 | higher than the limit for an extended period of time. This | 1277 | higher than the limit for an extended period of time. This |
1272 | reduces the impact on the workload and memory management. | 1278 | reduces the impact on the workload and memory management. |
1273 | 1279 | ||
1280 | memory.pressure | ||
1281 | A read-only nested-key file which exists on non-root cgroups. | ||
1282 | |||
1283 | Shows pressure stall information for memory. See | ||
1284 | Documentation/accounting/psi.txt for details. | ||
1285 | |||
1274 | 1286 | ||
1275 | Usage Guidelines | 1287 | Usage Guidelines |
1276 | ~~~~~~~~~~~~~~~~ | 1288 | ~~~~~~~~~~~~~~~~ |
@@ -1408,6 +1420,12 @@ IO Interface Files | |||
1408 | 1420 | ||
1409 | 8:16 rbps=2097152 wbps=max riops=max wiops=max | 1421 | 8:16 rbps=2097152 wbps=max riops=max wiops=max |
1410 | 1422 | ||
1423 | io.pressure | ||
1424 | A read-only nested-key file which exists on non-root cgroups. | ||
1425 | |||
1426 | Shows pressure stall information for IO. See | ||
1427 | Documentation/accounting/psi.txt for details. | ||
1428 | |||
1411 | 1429 | ||
1412 | Writeback | 1430 | Writeback |
1413 | ~~~~~~~~~ | 1431 | ~~~~~~~~~ |
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 22254c1fe1c5..5e1694fe035b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/u64_stats_sync.h> | 20 | #include <linux/u64_stats_sync.h> |
21 | #include <linux/workqueue.h> | 21 | #include <linux/workqueue.h> |
22 | #include <linux/bpf-cgroup.h> | 22 | #include <linux/bpf-cgroup.h> |
23 | #include <linux/psi_types.h> | ||
23 | 24 | ||
24 | #ifdef CONFIG_CGROUPS | 25 | #ifdef CONFIG_CGROUPS |
25 | 26 | ||
@@ -436,6 +437,9 @@ struct cgroup { | |||
436 | /* used to schedule release agent */ | 437 | /* used to schedule release agent */ |
437 | struct work_struct release_agent_work; | 438 | struct work_struct release_agent_work; |
438 | 439 | ||
440 | /* used to track pressure stalls */ | ||
441 | struct psi_group psi; | ||
442 | |||
439 | /* used to store eBPF programs */ | 443 | /* used to store eBPF programs */ |
440 | struct cgroup_bpf bpf; | 444 | struct cgroup_bpf bpf; |
441 | 445 | ||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b622d6608605..9968332cceed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) | |||
650 | pr_cont_kernfs_path(cgrp->kn); | 650 | pr_cont_kernfs_path(cgrp->kn); |
651 | } | 651 | } |
652 | 652 | ||
653 | static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) | ||
654 | { | ||
655 | return &cgrp->psi; | ||
656 | } | ||
657 | |||
653 | static inline void cgroup_init_kthreadd(void) | 658 | static inline void cgroup_init_kthreadd(void) |
654 | { | 659 | { |
655 | /* | 660 | /* |
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) | |||
703 | return NULL; | 708 | return NULL; |
704 | } | 709 | } |
705 | 710 | ||
711 | static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) | ||
712 | { | ||
713 | return NULL; | ||
714 | } | ||
715 | |||
716 | static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) | ||
717 | { | ||
718 | return NULL; | ||
719 | } | ||
720 | |||
706 | static inline bool task_under_cgroup_hierarchy(struct task_struct *task, | 721 | static inline bool task_under_cgroup_hierarchy(struct task_struct *task, |
707 | struct cgroup *ancestor) | 722 | struct cgroup *ancestor) |
708 | { | 723 | { |
diff --git a/include/linux/psi.h b/include/linux/psi.h index b0daf050de58..8e0725aac0aa 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h | |||
@@ -4,6 +4,9 @@ | |||
4 | #include <linux/psi_types.h> | 4 | #include <linux/psi_types.h> |
5 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | 6 | ||
7 | struct seq_file; | ||
8 | struct css_set; | ||
9 | |||
7 | #ifdef CONFIG_PSI | 10 | #ifdef CONFIG_PSI |
8 | 11 | ||
9 | extern bool psi_disabled; | 12 | extern bool psi_disabled; |
@@ -16,6 +19,14 @@ void psi_memstall_tick(struct task_struct *task, int cpu); | |||
16 | void psi_memstall_enter(unsigned long *flags); | 19 | void psi_memstall_enter(unsigned long *flags); |
17 | void psi_memstall_leave(unsigned long *flags); | 20 | void psi_memstall_leave(unsigned long *flags); |
18 | 21 | ||
22 | int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); | ||
23 | |||
24 | #ifdef CONFIG_CGROUPS | ||
25 | int psi_cgroup_alloc(struct cgroup *cgrp); | ||
26 | void psi_cgroup_free(struct cgroup *cgrp); | ||
27 | void cgroup_move_task(struct task_struct *p, struct css_set *to); | ||
28 | #endif | ||
29 | |||
19 | #else /* CONFIG_PSI */ | 30 | #else /* CONFIG_PSI */ |
20 | 31 | ||
21 | static inline void psi_init(void) {} | 32 | static inline void psi_init(void) {} |
@@ -23,6 +34,20 @@ static inline void psi_init(void) {} | |||
23 | static inline void psi_memstall_enter(unsigned long *flags) {} | 34 | static inline void psi_memstall_enter(unsigned long *flags) {} |
24 | static inline void psi_memstall_leave(unsigned long *flags) {} | 35 | static inline void psi_memstall_leave(unsigned long *flags) {} |
25 | 36 | ||
37 | #ifdef CONFIG_CGROUPS | ||
38 | static inline int psi_cgroup_alloc(struct cgroup *cgrp) | ||
39 | { | ||
40 | return 0; | ||
41 | } | ||
42 | static inline void psi_cgroup_free(struct cgroup *cgrp) | ||
43 | { | ||
44 | } | ||
45 | static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) | ||
46 | { | ||
47 | rcu_assign_pointer(p->cgroups, to); | ||
48 | } | ||
49 | #endif | ||
50 | |||
26 | #endif /* CONFIG_PSI */ | 51 | #endif /* CONFIG_PSI */ |
27 | 52 | ||
28 | #endif /* _LINUX_PSI_H */ | 53 | #endif /* _LINUX_PSI_H */ |
diff --git a/init/Kconfig b/init/Kconfig index 26e639df5517..a4112e95724a 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -501,6 +501,10 @@ config PSI | |||
501 | the share of walltime in which some or all tasks in the system are | 501 | the share of walltime in which some or all tasks in the system are |
502 | delayed due to contention of the respective resource. | 502 | delayed due to contention of the respective resource. |
503 | 503 | ||
504 | In kernels with cgroup support, cgroups (cgroup2 only) will | ||
505 | have cpu.pressure, memory.pressure, and io.pressure files, | ||
506 | which aggregate pressure stalls for the grouped tasks only. | ||
507 | |||
504 | For more details see Documentation/accounting/psi.txt. | 508 | For more details see Documentation/accounting/psi.txt. |
505 | 509 | ||
506 | Say N if unsure. | 510 | Say N if unsure. |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/nsproxy.h> | 55 | #include <linux/nsproxy.h> |
56 | #include <linux/file.h> | 56 | #include <linux/file.h> |
57 | #include <linux/sched/cputime.h> | 57 | #include <linux/sched/cputime.h> |
58 | #include <linux/psi.h> | ||
58 | #include <net/sock.h> | 59 | #include <net/sock.h> |
59 | 60 | ||
60 | #define CREATE_TRACE_POINTS | 61 | #define CREATE_TRACE_POINTS |
@@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, | |||
862 | */ | 863 | */ |
863 | WARN_ON_ONCE(task->flags & PF_EXITING); | 864 | WARN_ON_ONCE(task->flags & PF_EXITING); |
864 | 865 | ||
865 | rcu_assign_pointer(task->cgroups, to_cset); | 866 | cgroup_move_task(task, to_cset); |
866 | list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : | 867 | list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : |
867 | &to_cset->tasks); | 868 | &to_cset->tasks); |
868 | } | 869 | } |
@@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) | |||
3446 | return ret; | 3447 | return ret; |
3447 | } | 3448 | } |
3448 | 3449 | ||
3450 | #ifdef CONFIG_PSI | ||
3451 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v) | ||
3452 | { | ||
3453 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); | ||
3454 | } | ||
3455 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) | ||
3456 | { | ||
3457 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); | ||
3458 | } | ||
3459 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) | ||
3460 | { | ||
3461 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); | ||
3462 | } | ||
3463 | #endif | ||
3464 | |||
3449 | static int cgroup_file_open(struct kernfs_open_file *of) | 3465 | static int cgroup_file_open(struct kernfs_open_file *of) |
3450 | { | 3466 | { |
3451 | struct cftype *cft = of->kn->priv; | 3467 | struct cftype *cft = of->kn->priv; |
@@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { | |||
4576 | .flags = CFTYPE_NOT_ON_ROOT, | 4592 | .flags = CFTYPE_NOT_ON_ROOT, |
4577 | .seq_show = cpu_stat_show, | 4593 | .seq_show = cpu_stat_show, |
4578 | }, | 4594 | }, |
4595 | #ifdef CONFIG_PSI | ||
4596 | { | ||
4597 | .name = "io.pressure", | ||
4598 | .flags = CFTYPE_NOT_ON_ROOT, | ||
4599 | .seq_show = cgroup_io_pressure_show, | ||
4600 | }, | ||
4601 | { | ||
4602 | .name = "memory.pressure", | ||
4603 | .flags = CFTYPE_NOT_ON_ROOT, | ||
4604 | .seq_show = cgroup_memory_pressure_show, | ||
4605 | }, | ||
4606 | { | ||
4607 | .name = "cpu.pressure", | ||
4608 | .flags = CFTYPE_NOT_ON_ROOT, | ||
4609 | .seq_show = cgroup_cpu_pressure_show, | ||
4610 | }, | ||
4611 | #endif | ||
4579 | { } /* terminate */ | 4612 | { } /* terminate */ |
4580 | }; | 4613 | }; |
4581 | 4614 | ||
@@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) | |||
4636 | */ | 4669 | */ |
4637 | cgroup_put(cgroup_parent(cgrp)); | 4670 | cgroup_put(cgroup_parent(cgrp)); |
4638 | kernfs_put(cgrp->kn); | 4671 | kernfs_put(cgrp->kn); |
4672 | psi_cgroup_free(cgrp); | ||
4639 | if (cgroup_on_dfl(cgrp)) | 4673 | if (cgroup_on_dfl(cgrp)) |
4640 | cgroup_rstat_exit(cgrp); | 4674 | cgroup_rstat_exit(cgrp); |
4641 | kfree(cgrp); | 4675 | kfree(cgrp); |
@@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
4892 | cgrp->self.parent = &parent->self; | 4926 | cgrp->self.parent = &parent->self; |
4893 | cgrp->root = root; | 4927 | cgrp->root = root; |
4894 | cgrp->level = level; | 4928 | cgrp->level = level; |
4895 | ret = cgroup_bpf_inherit(cgrp); | 4929 | |
4930 | ret = psi_cgroup_alloc(cgrp); | ||
4896 | if (ret) | 4931 | if (ret) |
4897 | goto out_idr_free; | 4932 | goto out_idr_free; |
4898 | 4933 | ||
4934 | ret = cgroup_bpf_inherit(cgrp); | ||
4935 | if (ret) | ||
4936 | goto out_psi_free; | ||
4937 | |||
4899 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { | 4938 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { |
4900 | cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; | 4939 | cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; |
4901 | 4940 | ||
@@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
4933 | 4972 | ||
4934 | return cgrp; | 4973 | return cgrp; |
4935 | 4974 | ||
4975 | out_psi_free: | ||
4976 | psi_cgroup_free(cgrp); | ||
4936 | out_idr_free: | 4977 | out_idr_free: |
4937 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); | 4978 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
4938 | out_stat_exit: | 4979 | out_stat_exit: |
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 595414599b98..7cdecfc010af 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c | |||
@@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu, | |||
473 | schedule_delayed_work(&group->clock_work, PSI_FREQ); | 473 | schedule_delayed_work(&group->clock_work, PSI_FREQ); |
474 | } | 474 | } |
475 | 475 | ||
476 | static struct psi_group *iterate_groups(struct task_struct *task, void **iter) | ||
477 | { | ||
478 | #ifdef CONFIG_CGROUPS | ||
479 | struct cgroup *cgroup = NULL; | ||
480 | |||
481 | if (!*iter) | ||
482 | cgroup = task->cgroups->dfl_cgrp; | ||
483 | else if (*iter == &psi_system) | ||
484 | return NULL; | ||
485 | else | ||
486 | cgroup = cgroup_parent(*iter); | ||
487 | |||
488 | if (cgroup && cgroup_parent(cgroup)) { | ||
489 | *iter = cgroup; | ||
490 | return cgroup_psi(cgroup); | ||
491 | } | ||
492 | #else | ||
493 | if (*iter) | ||
494 | return NULL; | ||
495 | #endif | ||
496 | *iter = &psi_system; | ||
497 | return &psi_system; | ||
498 | } | ||
499 | |||
476 | void psi_task_change(struct task_struct *task, int clear, int set) | 500 | void psi_task_change(struct task_struct *task, int clear, int set) |
477 | { | 501 | { |
478 | int cpu = task_cpu(task); | 502 | int cpu = task_cpu(task); |
503 | struct psi_group *group; | ||
504 | void *iter = NULL; | ||
479 | 505 | ||
480 | if (!task->pid) | 506 | if (!task->pid) |
481 | return; | 507 | return; |
@@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set) | |||
492 | task->psi_flags &= ~clear; | 518 | task->psi_flags &= ~clear; |
493 | task->psi_flags |= set; | 519 | task->psi_flags |= set; |
494 | 520 | ||
495 | psi_group_change(&psi_system, cpu, clear, set); | 521 | while ((group = iterate_groups(task, &iter))) |
522 | psi_group_change(group, cpu, clear, set); | ||
496 | } | 523 | } |
497 | 524 | ||
498 | void psi_memstall_tick(struct task_struct *task, int cpu) | 525 | void psi_memstall_tick(struct task_struct *task, int cpu) |
499 | { | 526 | { |
500 | struct psi_group_cpu *groupc; | 527 | struct psi_group *group; |
528 | void *iter = NULL; | ||
501 | 529 | ||
502 | groupc = per_cpu_ptr(psi_system.pcpu, cpu); | 530 | while ((group = iterate_groups(task, &iter))) { |
503 | write_seqcount_begin(&groupc->seq); | 531 | struct psi_group_cpu *groupc; |
504 | record_times(groupc, cpu, true); | 532 | |
505 | write_seqcount_end(&groupc->seq); | 533 | groupc = per_cpu_ptr(group->pcpu, cpu); |
534 | write_seqcount_begin(&groupc->seq); | ||
535 | record_times(groupc, cpu, true); | ||
536 | write_seqcount_end(&groupc->seq); | ||
537 | } | ||
506 | } | 538 | } |
507 | 539 | ||
508 | /** | 540 | /** |
@@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags) | |||
565 | rq_unlock_irq(rq, &rf); | 597 | rq_unlock_irq(rq, &rf); |
566 | } | 598 | } |
567 | 599 | ||
568 | static int psi_show(struct seq_file *m, struct psi_group *group, | 600 | #ifdef CONFIG_CGROUPS |
569 | enum psi_res res) | 601 | int psi_cgroup_alloc(struct cgroup *cgroup) |
602 | { | ||
603 | if (psi_disabled) | ||
604 | return 0; | ||
605 | |||
606 | cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); | ||
607 | if (!cgroup->psi.pcpu) | ||
608 | return -ENOMEM; | ||
609 | group_init(&cgroup->psi); | ||
610 | return 0; | ||
611 | } | ||
612 | |||
613 | void psi_cgroup_free(struct cgroup *cgroup) | ||
614 | { | ||
615 | if (psi_disabled) | ||
616 | return; | ||
617 | |||
618 | cancel_delayed_work_sync(&cgroup->psi.clock_work); | ||
619 | free_percpu(cgroup->psi.pcpu); | ||
620 | } | ||
621 | |||
622 | /** | ||
623 | * cgroup_move_task - move task to a different cgroup | ||
624 | * @task: the task | ||
625 | * @to: the target css_set | ||
626 | * | ||
627 | * Move task to a new cgroup and safely migrate its associated stall | ||
628 | * state between the different groups. | ||
629 | * | ||
630 | * This function acquires the task's rq lock to lock out concurrent | ||
631 | * changes to the task's scheduling state and - in case the task is | ||
632 | * running - concurrent changes to its stall state. | ||
633 | */ | ||
634 | void cgroup_move_task(struct task_struct *task, struct css_set *to) | ||
635 | { | ||
636 | bool move_psi = !psi_disabled; | ||
637 | unsigned int task_flags = 0; | ||
638 | struct rq_flags rf; | ||
639 | struct rq *rq; | ||
640 | |||
641 | if (move_psi) { | ||
642 | rq = task_rq_lock(task, &rf); | ||
643 | |||
644 | if (task_on_rq_queued(task)) | ||
645 | task_flags = TSK_RUNNING; | ||
646 | else if (task->in_iowait) | ||
647 | task_flags = TSK_IOWAIT; | ||
648 | |||
649 | if (task->flags & PF_MEMSTALL) | ||
650 | task_flags |= TSK_MEMSTALL; | ||
651 | |||
652 | if (task_flags) | ||
653 | psi_task_change(task, task_flags, 0); | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * Lame to do this here, but the scheduler cannot be locked | ||
658 | * from the outside, so we move cgroups from inside sched/. | ||
659 | */ | ||
660 | rcu_assign_pointer(task->cgroups, to); | ||
661 | |||
662 | if (move_psi) { | ||
663 | if (task_flags) | ||
664 | psi_task_change(task, 0, task_flags); | ||
665 | |||
666 | task_rq_unlock(rq, task, &rf); | ||
667 | } | ||
668 | } | ||
669 | #endif /* CONFIG_CGROUPS */ | ||
670 | |||
671 | int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | ||
570 | { | 672 | { |
571 | int full; | 673 | int full; |
572 | 674 | ||