author	Suren Baghdasaryan <surenb@google.com>	2019-05-14 18:41:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-14 22:52:48 -0400
commit	0e94682b73bfa6c44c98af7a26771c9c08c055d5
tree	c9bb857be3cef50fc3dbdc319c514dd4d7e93238
parent	8af0c18af1425fc70686c0fdcfc0072cd8431aa0
psi: introduce psi monitor

Psi monitor aims to provide a low-latency, short-term pressure detection
mechanism configurable by users. It allows users to monitor psi metric
growth and trigger events whenever a metric rises above a user-defined
threshold within a user-defined time window.

Time window and threshold are both expressed in usecs. Multiple psi
resources with different thresholds and window sizes can be monitored
concurrently.

Psi monitors activate when the system enters a stall state for the
monitored psi metric and deactivate upon exit from the stall state.
While the system is in the stall state, psi signal growth is monitored
at a rate of 10 times per tracking window. The minimum window size is
500ms, therefore the minimum monitoring interval is 50ms. The maximum
window size is 10s, with a monitoring interval of 1s.

When activated, a psi monitor stays active for at least the duration of
one tracking window to avoid repeated activations/deactivations when the
psi signal is bouncing. Notifications to users are rate-limited to one
per tracking window.

Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
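As a worked example of the sampling arithmetic above (a sketch, not part
of the patch): the kernel samples psi growth 10 times per tracking
window, so the monitoring interval is simply window/10.

#include <stdio.h>

/* Illustrates the changelog's arithmetic: with 10 updates per window,
 * a 500ms window is polled every 50ms and a 10s window every 1s. */
int main(void)
{
	unsigned int window_us[] = { 500000, 1000000, 10000000 };
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("window %u us -> poll period %u us\n",
		       window_us[i], window_us[i] / 10);
	return 0;
}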
-rw-r--r--  Documentation/accounting/psi.txt | 107
-rw-r--r--  include/linux/psi.h              |   8
-rw-r--r--  include/linux/psi_types.h        |  82
-rw-r--r--  kernel/cgroup/cgroup.c           |  71
-rw-r--r--  kernel/sched/psi.c               | 494
5 files changed, 742 insertions(+), 20 deletions(-)
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
index 7e71c9c1d8e9..5cbe5659e3b7 100644
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time
 spikes which wouldn't necessarily make a dent in the time averages,
 or to average trends over custom time frames.
 
+Monitoring for pressure thresholds
+==================================
+
+Users can register triggers and use poll() to be woken up when resource
+pressure exceeds certain thresholds.
+
+A trigger describes the maximum cumulative stall time over a specific
+time window, e.g. 100ms of total stall time within any 500ms window to
+generate a wakeup event.
+
+To register a trigger, a user has to open the psi interface file under
+/proc/pressure/ representing the resource to be monitored and write the
+desired threshold and time window. The open file descriptor should be
+used to wait for trigger events using select(), poll() or epoll().
+The following format is used:
+
+<some|full> <stall amount in us> <time window in us>
+
+For example, writing "some 150000 1000000" into /proc/pressure/memory
+would add a 150ms threshold for partial memory stall measured within
+a 1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
+would add a 50ms threshold for full io stall measured within a 1sec time window.
+
+Triggers can be set on more than one psi metric, and more than one trigger
+for the same psi metric can be specified. However, each trigger requires a
+separate file descriptor to be able to poll it separately from the others;
+therefore a separate open() syscall should be made for each trigger, even
+when opening the same psi interface file.
+
+Monitors activate only when the system enters a stall state for the
+monitored psi metric and deactivate upon exit from the stall state. While
+the system is in the stall state, psi signal growth is monitored at a rate
+of 10 times per tracking window.
+
+The kernel accepts window sizes ranging from 500ms to 10s, therefore the
+minimum monitoring update interval is 50ms and the maximum is 1s. The min
+limit is set to prevent overly frequent polling. The max limit is chosen as
+a high enough number after which monitors are most likely not needed and
+psi averages can be used instead.
+
+When activated, a psi monitor stays active for at least the duration of
+one tracking window to avoid repeated activations/deactivations when the
+system is bouncing in and out of the stall state.
+
+Notifications to userspace are rate-limited to one per tracking window.
+
+The trigger will de-register when the file descriptor used to define the
+trigger is closed.
+
+Userspace monitor usage example
+===============================
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <poll.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Monitor memory partial stall with 1s tracking window size
+ * and 150ms threshold.
+ */
+int main() {
+	const char trig[] = "some 150000 1000000";
+	struct pollfd fds;
+	int n;
+
+	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+	if (fds.fd < 0) {
+		printf("/proc/pressure/memory open error: %s\n",
+			strerror(errno));
+		return 1;
+	}
+	fds.events = POLLPRI;
+
+	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
+		printf("/proc/pressure/memory write error: %s\n",
+			strerror(errno));
+		return 1;
+	}
+
+	printf("waiting for events...\n");
+	while (1) {
+		n = poll(&fds, 1, -1);
+		if (n < 0) {
+			printf("poll error: %s\n", strerror(errno));
+			return 1;
+		}
+		if (fds.revents & POLLERR) {
+			printf("got POLLERR, event source is gone\n");
+			return 0;
+		}
+		if (fds.revents & POLLPRI) {
+			printf("event triggered!\n");
+		} else {
+			printf("unknown event received: 0x%x\n", fds.revents);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 Cgroup2 interface
 =================
 
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
 into cgroups. Each subdirectory in the cgroupfs mountpoint contains
 cpu.pressure, memory.pressure, and io.pressure files; the format is
 the same as the /proc/pressure/ files.
+
+Per-cgroup psi monitors can be specified and used the same way as
+system-wide ones.
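The documentation above notes that the trigger fd can also be waited on
with epoll(); a minimal epoll-based variant of the same monitor (a
sketch, not part of the patch, with error handling kept minimal):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

/* Same 150ms/1s memory trigger as in the example above, waited on with
 * epoll instead of poll(). */
int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct epoll_event ev = { .events = EPOLLPRI };
	int fd, epfd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		printf("trigger setup error: %s\n", strerror(errno));
		return 1;
	}

	epfd = epoll_create1(0);
	if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
		printf("epoll setup error: %s\n", strerror(errno));
		return 1;
	}

	while (1) {
		struct epoll_event out;

		if (epoll_wait(epfd, &out, 1, -1) < 0) {
			printf("epoll_wait error: %s\n", strerror(errno));
			return 1;
		}
		if (out.events & EPOLLERR) {
			printf("event source is gone\n");
			return 0;
		}
		if (out.events & EPOLLPRI)
			printf("event triggered!\n");
	}
}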
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7006008d5b72..af892c290116 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,7 @@
 #include <linux/jump_label.h>
 #include <linux/psi_types.h>
 #include <linux/sched.h>
+#include <linux/poll.h>
 
 struct seq_file;
 struct css_set;
@@ -26,6 +27,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
 void cgroup_move_task(struct task_struct *p, struct css_set *to);
+
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+			char *buf, size_t nbytes, enum psi_res res);
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
+
+__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
+			  poll_table *wait);
 #endif
 
 #else /* CONFIG_PSI */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 4d1c1f67be18..07aaf9b82241 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -1,8 +1,11 @@
 #ifndef _LINUX_PSI_TYPES_H
 #define _LINUX_PSI_TYPES_H
 
+#include <linux/kthread.h>
 #include <linux/seqlock.h>
 #include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/wait.h>
 
 #ifdef CONFIG_PSI
 
@@ -44,6 +47,12 @@ enum psi_states {
 	NR_PSI_STATES = 6,
 };
 
+enum psi_aggregators {
+	PSI_AVGS = 0,
+	PSI_POLL,
+	NR_PSI_AGGREGATORS,
+};
+
 struct psi_group_cpu {
 	/* 1st cacheline updated by the scheduler */
 
@@ -65,7 +74,55 @@ struct psi_group_cpu {
 	/* 2nd cacheline updated by the aggregator */
 
 	/* Delta detection against the sampling buckets */
-	u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
+	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
+			____cacheline_aligned_in_smp;
+};
+
+/* PSI growth tracking window */
+struct psi_window {
+	/* Window size in ns */
+	u64 size;
+
+	/* Start time of the current window in ns */
+	u64 start_time;
+
+	/* Value at the start of the window */
+	u64 start_value;
+
+	/* Value growth in the previous window */
+	u64 prev_growth;
+};
+
+struct psi_trigger {
+	/* PSI state being monitored by the trigger */
+	enum psi_states state;
+
+	/* User-specified threshold in ns */
+	u64 threshold;
+
+	/* List node inside triggers list */
+	struct list_head node;
+
+	/* Backpointer needed during trigger destruction */
+	struct psi_group *group;
+
+	/* Wait queue for polling */
+	wait_queue_head_t event_wait;
+
+	/* Pending event flag */
+	int event;
+
+	/* Tracking window */
+	struct psi_window win;
+
+	/*
+	 * Time last event was generated. Used for rate-limiting
+	 * events to one per window
+	 */
+	u64 last_event_time;
+
+	/* Refcounting to prevent premature destruction */
+	struct kref refcount;
 };
 
 struct psi_group {
@@ -79,11 +136,32 @@ struct psi_group {
 	u64 avg_total[NR_PSI_STATES - 1];
 	u64 avg_last_update;
 	u64 avg_next_update;
+
+	/* Aggregator work control */
 	struct delayed_work avgs_work;
 
 	/* Total stall times and sampled pressure averages */
-	u64 total[NR_PSI_STATES - 1];
+	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
 	unsigned long avg[NR_PSI_STATES - 1][3];
+
+	/* Monitor work control */
+	atomic_t poll_scheduled;
+	struct kthread_worker __rcu *poll_kworker;
+	struct kthread_delayed_work poll_work;
+
+	/* Protects data used by the monitor */
+	struct mutex trigger_lock;
+
+	/* Configured polling triggers */
+	struct list_head triggers;
+	u32 nr_triggers[NR_PSI_STATES - 1];
+	u32 poll_states;
+	u64 poll_min_period;
+
+	/* Total stall times at the start of monitor activation */
+	u64 polling_total[NR_PSI_STATES - 1];
+	u64 polling_next_update;
+	u64 polling_until;
 };
 
 #else /* CONFIG_PSI */
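The psi_window fields above drive the sliding-window approximation
implemented by window_update() in kernel/sched/psi.c further down. A
userspace restatement of that interpolation with made-up numbers (a
sketch for illustration only; the window-reset side effect is omitted):

#include <stdio.h>
#include <stdint.h>

/* Growth in the partially elapsed window, plus a linear share of the
 * previous window's growth for the remaining fraction of the window. */
static uint64_t window_growth(uint64_t size, uint64_t elapsed,
			      uint64_t growth_now, uint64_t prev_growth)
{
	if (elapsed > size)	/* a full window elapsed: no carry-over */
		return growth_now;
	return growth_now + prev_growth * (size - elapsed) / size;
}

int main(void)
{
	/* 1000ms window, 400ms elapsed, 60ms stall so far, 100ms stall in
	 * the previous window: 60 + 100 * 600/1000 = 120ms. */
	printf("%llu\n", (unsigned long long)
	       window_growth(1000, 400, 60, 100));
	return 0;
}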
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 327f37c9fdfa..1140357d46f4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
 	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
 }
-#endif
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+					  size_t nbytes, enum psi_res res)
+{
+	struct psi_trigger *new;
+	struct cgroup *cgrp;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
+	cgroup_get(cgrp);
+	cgroup_kn_unlock(of->kn);
+
+	new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+	if (IS_ERR(new)) {
+		cgroup_put(cgrp);
+		return PTR_ERR(new);
+	}
+
+	psi_trigger_replace(&of->priv, new);
+
+	cgroup_put(cgrp);
+
+	return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes,
+					  loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes,
+					  loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes,
+					  loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+					  poll_table *pt)
+{
+	return psi_trigger_poll(&of->priv, of->file, pt);
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+	psi_trigger_replace(&of->priv, NULL);
+}
+#endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
 {
@@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = {
 		.name = "io.pressure",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_io_pressure_show,
+		.write = cgroup_io_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
 	},
 	{
 		.name = "memory.pressure",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_memory_pressure_show,
+		.write = cgroup_memory_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
 	},
 	{
 		.name = "cpu.pressure",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_cpu_pressure_show,
+		.write = cgroup_cpu_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
 	},
-#endif
+#endif /* CONFIG_PSI */
 	{ }	/* terminate */
 };
 
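The kernfs hooks above expose the same trigger interface on every
cgroup's pressure files; a userspace sketch, with a hypothetical cgroup
path (not part of the patch):

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Registers a "full 50000 1000000" trigger (50ms of full io stall per
 * 1s window) on a cgroup2 io.pressure file. The cgroup path below is
 * hypothetical. */
int main(void)
{
	const char trig[] = "full 50000 1000000";
	struct pollfd fds;

	fds.fd = open("/sys/fs/cgroup/mygroup/io.pressure",
		      O_RDWR | O_NONBLOCK);
	if (fds.fd < 0 || write(fds.fd, trig, strlen(trig) + 1) < 0) {
		printf("trigger setup error: %s\n", strerror(errno));
		return 1;
	}
	fds.events = POLLPRI;

	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
		printf("cgroup io pressure event\n");
	return 0;
}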
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1b99eeffaa25..e88918e0bb6d 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
  * Copyright (c) 2018 Facebook, Inc.
  * Author: Johannes Weiner <hannes@cmpxchg.org>
  *
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
  * When CPU, memory and IO are contended, tasks experience delays that
  * reduce throughput and introduce latencies into the workload. Memory
  * and IO contention, in addition, can cause a full loss of forward
@@ -129,9 +132,13 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/seqlock.h>
+#include <linux/uaccess.h>
 #include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
 #include <linux/psi.h>
 #include "sched.h"
 
@@ -156,6 +163,11 @@ __setup("psi=", setup_psi);
 #define EXP_60s		1981		/* 1/exp(2s/60s) */
 #define EXP_300s	2034		/* 1/exp(2s/300s) */
 
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000	/* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000	/* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10	/* 10 updates per window */
+
 /* Sampling frequency in nanoseconds */
 static u64 psi_period __read_mostly;
 
@@ -176,6 +188,17 @@ static void group_init(struct psi_group *group)
 	group->avg_next_update = sched_clock() + psi_period;
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
+	/* Init trigger-related members */
+	atomic_set(&group->poll_scheduled, 0);
+	mutex_init(&group->trigger_lock);
+	INIT_LIST_HEAD(&group->triggers);
+	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+	group->poll_states = 0;
+	group->poll_min_period = U32_MAX;
+	memset(group->polling_total, 0, sizeof(group->polling_total));
+	group->polling_next_update = ULLONG_MAX;
+	group->polling_until = 0;
+	rcu_assign_pointer(group->poll_kworker, NULL);
 }
 
 void __init psi_init(void)
@@ -210,7 +233,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	}
 }
 
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
+static void get_recent_times(struct psi_group *group, int cpu,
+			     enum psi_aggregators aggregator, u32 *times,
 			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -245,8 +269,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
 		if (state_mask & (1 << s))
 			times[s] += now - state_start;
 
-		delta = times[s] - groupc->times_prev[s];
-		groupc->times_prev[s] = times[s];
+		delta = times[s] - groupc->times_prev[aggregator][s];
+		groupc->times_prev[aggregator][s] = times[s];
 
 		times[s] = delta;
 		if (delta)
@@ -274,7 +298,9 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
 	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
-static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
+static void collect_percpu_times(struct psi_group *group,
+				 enum psi_aggregators aggregator,
+				 u32 *pchanged_states)
 {
 	u64 deltas[NR_PSI_STATES - 1] = { 0, };
 	unsigned long nonidle_total = 0;
@@ -295,7 +321,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
 		u32 nonidle;
 		u32 cpu_changed_states;
 
-		get_recent_times(group, cpu, times,
+		get_recent_times(group, cpu, aggregator, times,
 				 &cpu_changed_states);
 		changed_states |= cpu_changed_states;
 
@@ -320,7 +346,8 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
 
 	/* total= */
 	for (s = 0; s < NR_PSI_STATES - 1; s++)
-		group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+		group->total[aggregator][s] +=
+			div_u64(deltas[s], max(nonidle_total, 1UL));
 
 	if (pchanged_states)
 		*pchanged_states = changed_states;
@@ -352,7 +379,7 @@ static u64 update_averages(struct psi_group *group, u64 now)
 	for (s = 0; s < NR_PSI_STATES - 1; s++) {
 		u32 sample;
 
-		sample = group->total[s] - group->avg_total[s];
+		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
 		/*
 		 * Due to the lockless sampling of the time buckets,
 		 * recorded time deltas can slip into the next period,
@@ -394,7 +421,7 @@ static void psi_avgs_work(struct work_struct *work)
 
 	now = sched_clock();
 
-	collect_percpu_times(group, &changed_states);
+	collect_percpu_times(group, PSI_AVGS, &changed_states);
 	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
@@ -414,6 +441,187 @@ static void psi_avgs_work(struct work_struct *work)
 	mutex_unlock(&group->avgs_lock);
 }
 
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+			 u64 prev_growth)
+{
+	win->start_time = now;
+	win->start_value = value;
+	win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+	u64 elapsed;
+	u64 growth;
+
+	elapsed = now - win->start_time;
+	growth = value - win->start_value;
+	/*
+	 * After each tracking window passes win->start_value and
+	 * win->start_time get reset and win->prev_growth stores
+	 * the average per-window growth of the previous window.
+	 * win->prev_growth is then used to interpolate additional
+	 * growth from the previous window assuming it was linear.
+	 */
+	if (elapsed > win->size)
+		window_reset(win, now, value, growth);
+	else {
+		u32 remaining;
+
+		remaining = win->size - elapsed;
+		growth += div_u64(win->prev_growth * remaining, win->size);
+	}
+
+	return growth;
+}
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+	struct psi_trigger *t;
+
+	list_for_each_entry(t, &group->triggers, node)
+		window_reset(&t->win, now,
+				group->total[PSI_POLL][t->state], 0);
+	memcpy(group->polling_total, group->total[PSI_POLL],
+		   sizeof(group->polling_total));
+	group->polling_next_update = now + group->poll_min_period;
+}
+
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+	struct psi_trigger *t;
+	bool new_stall = false;
+	u64 *total = group->total[PSI_POLL];
+
+	/*
+	 * On subsequent updates, calculate growth deltas and let
+	 * watchers know when their specified thresholds are exceeded.
+	 */
+	list_for_each_entry(t, &group->triggers, node) {
+		u64 growth;
+
+		/* Check for stall activity */
+		if (group->polling_total[t->state] == total[t->state])
+			continue;
+
+		/*
+		 * Multiple triggers might be looking at the same state,
+		 * remember to update group->polling_total[] once we've
+		 * been through all of them. Also remember to extend the
+		 * polling time if we see new stall activity.
+		 */
+		new_stall = true;
+
+		/* Calculate growth since last update */
+		growth = window_update(&t->win, now, total[t->state]);
+		if (growth < t->threshold)
+			continue;
+
+		/* Limit event signaling to once per window */
+		if (now < t->last_event_time + t->win.size)
+			continue;
+
+		/* Generate an event */
+		if (cmpxchg(&t->event, 0, 1) == 0)
+			wake_up_interruptible(&t->event_wait);
+		t->last_event_time = now;
+	}
+
+	if (new_stall)
+		memcpy(group->polling_total, total,
+				sizeof(group->polling_total));
+
+	return now + group->poll_min_period;
+}
+
+/*
+ * Schedule polling if it's not already scheduled. It's safe to call even from
+ * hotpath because even though kthread_queue_delayed_work takes worker->lock
+ * spinlock that spinlock is never contended due to poll_scheduled atomic
+ * preventing such competition.
+ */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+{
+	struct kthread_worker *kworker;
+
+	/* Do not reschedule if already scheduled */
+	if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+		return;
+
+	rcu_read_lock();
+
+	kworker = rcu_dereference(group->poll_kworker);
+	/*
+	 * kworker might be NULL in case psi_trigger_destroy races with
+	 * psi_task_change (hotpath) which can't use locks
+	 */
+	if (likely(kworker))
+		kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+	else
+		atomic_set(&group->poll_scheduled, 0);
+
+	rcu_read_unlock();
+}
+
+static void psi_poll_work(struct kthread_work *work)
+{
+	struct kthread_delayed_work *dwork;
+	struct psi_group *group;
+	u32 changed_states;
+	u64 now;
+
+	dwork = container_of(work, struct kthread_delayed_work, work);
+	group = container_of(dwork, struct psi_group, poll_work);
+
+	atomic_set(&group->poll_scheduled, 0);
+
+	mutex_lock(&group->trigger_lock);
+
+	now = sched_clock();
+
+	collect_percpu_times(group, PSI_POLL, &changed_states);
+
+	if (changed_states & group->poll_states) {
+		/* Initialize trigger windows when entering polling mode */
+		if (now > group->polling_until)
+			init_triggers(group, now);
+
+		/*
+		 * Keep the monitor active for at least the duration of the
+		 * minimum tracking window as long as monitor states are
+		 * changing.
+		 */
+		group->polling_until = now +
+			group->poll_min_period * UPDATES_PER_WINDOW;
+	}
+
+	if (now > group->polling_until) {
+		group->polling_next_update = ULLONG_MAX;
+		goto out;
+	}
+
+	if (now >= group->polling_next_update)
+		group->polling_next_update = update_triggers(group, now);
+
+	psi_schedule_poll_work(group,
+		nsecs_to_jiffies(group->polling_next_update - now) + 1);
+
+out:
+	mutex_unlock(&group->trigger_lock);
+}
+
 static void record_times(struct psi_group_cpu *groupc, int cpu,
 			 bool memstall_tick)
 {
@@ -460,8 +668,8 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
-static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set)
+static u32 psi_group_change(struct psi_group *group, int cpu,
+			    unsigned int clear, unsigned int set)
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
@@ -507,6 +715,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	groupc->state_mask = state_mask;
 
 	write_seqcount_end(&groupc->seq);
+
+	return state_mask;
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -567,7 +777,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter))) {
-		psi_group_change(group, cpu, clear, set);
+		u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+		if (state_mask & group->poll_states)
+			psi_schedule_poll_work(group, 1);
+
 		if (wake_clock && !delayed_work_pending(&group->avgs_work))
 			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 	}
@@ -668,6 +882,8 @@ void psi_cgroup_free(struct cgroup *cgroup)
 
 	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
 	free_percpu(cgroup->psi.pcpu);
+	/* All triggers must be removed by now */
+	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
 }
 
 /**
@@ -731,7 +947,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	/* Update averages before reporting them */
 	mutex_lock(&group->avgs_lock);
 	now = sched_clock();
-	collect_percpu_times(group, NULL);
+	collect_percpu_times(group, PSI_AVGS, NULL);
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
@@ -743,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 
 	for (w = 0; w < 3; w++)
 		avg[w] = group->avg[res * 2 + full][w];
-	total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+	total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+			NSEC_PER_USEC);
 
 	seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
 		   full ? "full" : "some",
@@ -786,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
 	return single_open(file, psi_cpu_show, NULL);
 }
 
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+			char *buf, size_t nbytes, enum psi_res res)
+{
+	struct psi_trigger *t;
+	enum psi_states state;
+	u32 threshold_us;
+	u32 window_us;
+
+	if (static_branch_likely(&psi_disabled))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+		state = PSI_IO_SOME + res * 2;
+	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+		state = PSI_IO_FULL + res * 2;
+	else
+		return ERR_PTR(-EINVAL);
+
+	if (state >= PSI_NONIDLE)
+		return ERR_PTR(-EINVAL);
+
+	if (window_us < WINDOW_MIN_US ||
+		window_us > WINDOW_MAX_US)
+		return ERR_PTR(-EINVAL);
+
+	/* Check threshold */
+	if (threshold_us == 0 || threshold_us > window_us)
+		return ERR_PTR(-EINVAL);
+
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	t->group = group;
+	t->state = state;
+	t->threshold = threshold_us * NSEC_PER_USEC;
+	t->win.size = window_us * NSEC_PER_USEC;
+	window_reset(&t->win, 0, 0, 0);
+
+	t->event = 0;
+	t->last_event_time = 0;
+	init_waitqueue_head(&t->event_wait);
+	kref_init(&t->refcount);
+
+	mutex_lock(&group->trigger_lock);
+
+	if (!rcu_access_pointer(group->poll_kworker)) {
+		struct sched_param param = {
+			.sched_priority = MAX_RT_PRIO - 1,
+		};
+		struct kthread_worker *kworker;
+
+		kworker = kthread_create_worker(0, "psimon");
+		if (IS_ERR(kworker)) {
+			kfree(t);
+			mutex_unlock(&group->trigger_lock);
+			return ERR_CAST(kworker);
+		}
+		sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+		kthread_init_delayed_work(&group->poll_work,
+				psi_poll_work);
+		rcu_assign_pointer(group->poll_kworker, kworker);
+	}
+
+	list_add(&t->node, &group->triggers);
+	group->poll_min_period = min(group->poll_min_period,
+		div_u64(t->win.size, UPDATES_PER_WINDOW));
+	group->nr_triggers[t->state]++;
+	group->poll_states |= (1 << t->state);
+
+	mutex_unlock(&group->trigger_lock);
+
+	return t;
+}
+
+static void psi_trigger_destroy(struct kref *ref)
+{
+	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
+	struct psi_group *group = t->group;
+	struct kthread_worker *kworker_to_destroy = NULL;
+
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/*
+	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted
+	 * from under a polling process.
+	 */
+	wake_up_interruptible(&t->event_wait);
+
+	mutex_lock(&group->trigger_lock);
+
+	if (!list_empty(&t->node)) {
+		struct psi_trigger *tmp;
+		u64 period = ULLONG_MAX;
+
+		list_del(&t->node);
+		group->nr_triggers[t->state]--;
+		if (!group->nr_triggers[t->state])
+			group->poll_states &= ~(1 << t->state);
+		/* reset min update period for the remaining triggers */
+		list_for_each_entry(tmp, &group->triggers, node)
+			period = min(period, div_u64(tmp->win.size,
+					UPDATES_PER_WINDOW));
+		group->poll_min_period = period;
+		/* Destroy poll_kworker when the last trigger is destroyed */
+		if (group->poll_states == 0) {
+			group->polling_until = 0;
+			kworker_to_destroy = rcu_dereference_protected(
+					group->poll_kworker,
+					lockdep_is_held(&group->trigger_lock));
+			rcu_assign_pointer(group->poll_kworker, NULL);
+		}
+	}
+
+	mutex_unlock(&group->trigger_lock);
+
+	/*
+	 * Wait for both *trigger_ptr from psi_trigger_replace and
+	 * poll_kworker RCUs to complete their read-side critical sections
+	 * before destroying the trigger and optionally the poll_kworker
+	 */
+	synchronize_rcu();
+	/*
+	 * Destroy the kworker after releasing trigger_lock to prevent a
+	 * deadlock while waiting for psi_poll_work to acquire trigger_lock
+	 */
+	if (kworker_to_destroy) {
+		kthread_cancel_delayed_work_sync(&group->poll_work);
+		kthread_destroy_worker(kworker_to_destroy);
+	}
+	kfree(t);
+}
+
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
+{
+	struct psi_trigger *old = *trigger_ptr;
+
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	rcu_assign_pointer(*trigger_ptr, new);
+	if (old)
+		kref_put(&old->refcount, psi_trigger_destroy);
+}
+
+__poll_t psi_trigger_poll(void **trigger_ptr,
+				struct file *file, poll_table *wait)
+{
+	__poll_t ret = DEFAULT_POLLMASK;
+	struct psi_trigger *t;
+
+	if (static_branch_likely(&psi_disabled))
+		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+	rcu_read_lock();
+
+	t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
+	if (!t) {
+		rcu_read_unlock();
+		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+	}
+	kref_get(&t->refcount);
+
+	rcu_read_unlock();
+
+	poll_wait(file, &t->event_wait, wait);
+
+	if (cmpxchg(&t->event, 1, 0) == 1)
+		ret |= EPOLLPRI;
+
+	kref_put(&t->refcount, psi_trigger_destroy);
+
+	return ret;
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+			 size_t nbytes, enum psi_res res)
+{
+	char buf[32];
+	size_t buf_size;
+	struct seq_file *seq;
+	struct psi_trigger *new;
+
+	if (static_branch_likely(&psi_disabled))
+		return -EOPNOTSUPP;
+
+	buf_size = min(nbytes, (sizeof(buf) - 1));
+	if (copy_from_user(buf, user_buf, buf_size))
+		return -EFAULT;
+
+	buf[buf_size - 1] = '\0';
+
+	new = psi_trigger_create(&psi_system, buf, nbytes, res);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	seq = file->private_data;
+	/* Take seq->lock to protect seq->private from concurrent writes */
+	mutex_lock(&seq->lock);
+	psi_trigger_replace(&seq->private, new);
+	mutex_unlock(&seq->lock);
+
+	return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
+			    size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
+			     size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+	struct seq_file *seq = file->private_data;
+
+	return psi_trigger_poll(&seq->private, file, wait);
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+
+	psi_trigger_replace(&seq->private, NULL);
+	return single_release(inode, file);
+}
+
 static const struct file_operations psi_io_fops = {
 	.open = psi_io_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.write = psi_io_write,
+	.poll = psi_fop_poll,
+	.release = psi_fop_release,
 };
 
 static const struct file_operations psi_memory_fops = {
 	.open = psi_memory_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.write = psi_memory_write,
+	.poll = psi_fop_poll,
+	.release = psi_fop_release,
 };
 
 static const struct file_operations psi_cpu_fops = {
 	.open = psi_cpu_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.write = psi_cpu_write,
+	.poll = psi_fop_poll,
+	.release = psi_fop_release,
 };
 
 static int __init psi_proc_init(void)