diff options
Diffstat (limited to 'kernel/irq/timings.c')
-rw-r--r-- | kernel/irq/timings.c | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c new file mode 100644 index 000000000000..c8c1d073fbf1 --- /dev/null +++ b/kernel/irq/timings.c | |||
@@ -0,0 +1,369 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/timings.c | ||
3 | * | ||
4 | * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/percpu.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/static_key.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/idr.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/math64.h> | ||
19 | |||
20 | #include <trace/events/irq.h> | ||
21 | |||
22 | #include "internals.h" | ||
23 | |||
/* Global switch for the irq timing machinery; off by default. */
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

/* Per-CPU circular buffer of encoded (irq, timestamp) samples. */
DEFINE_PER_CPU(struct irq_timings, irq_timings);

/*
 * Online statistics for one interrupt line (one instance per CPU,
 * allocated per-cpu in irq_timings_alloc).
 */
struct irqt_stat {
	u64 next_evt;   /* predicted timestamp of the next interrupt (ns) */
	u64 last_ts;    /* timestamp of the last recorded interrupt */
	u64 variance;   /* online variance accumulator (see irqs_update) */
	u32 avg;        /* online average of the interrupt interval (ns) */
	u32 nr_samples; /* number of samples in the current sequence */
	int anomalies;  /* consecutive intervals outside the distribution */
	int valid;      /* non-zero when the stats are usable for prediction */
};

/* Maps an irq number to its per-cpu irqt_stat structure. */
static DEFINE_IDR(irqt_stats);
39 | |||
/*
 * irq_timings_enable - turn on interrupt timings tracking
 *
 * Enables the irq_timing_enabled static branch so the interrupt
 * handling path starts recording timestamps.
 */
void irq_timings_enable(void)
{
	static_branch_enable(&irq_timing_enabled);
}
44 | |||
/*
 * irq_timings_disable - turn off interrupt timings tracking
 *
 * Disables the irq_timing_enabled static branch, stopping timestamp
 * recording in the interrupt handling path.
 */
void irq_timings_disable(void)
{
	static_branch_disable(&irq_timing_enabled);
}
49 | |||
50 | /** | ||
51 | * irqs_update - update the irq timing statistics with a new timestamp | ||
52 | * | ||
53 | * @irqs: an irqt_stat struct pointer | ||
54 | * @ts: the new timestamp | ||
55 | * | ||
56 | * The statistics are computed online, in other words, the code is | ||
57 | * designed to compute the statistics on a stream of values rather | ||
58 | * than doing multiple passes on the values to compute the average, | ||
59 | * then the variance. The integer division introduces a loss of | ||
60 | * precision but with an acceptable error margin regarding the results | ||
61 | * we would have with the double floating precision: we are dealing | ||
62 | * with nanosec, so big numbers, consequently the mantisse is | ||
63 | * negligeable, especially when converting the time in usec | ||
64 | * afterwards. | ||
65 | * | ||
66 | * The computation happens at idle time. When the CPU is not idle, the | ||
67 | * interrupts' timestamps are stored in the circular buffer, when the | ||
68 | * CPU goes idle and this routine is called, all the buffer's values | ||
69 | * are injected in the statistical model continuying to extend the | ||
70 | * statistics from the previous busy-idle cycle. | ||
71 | * | ||
72 | * The observations showed a device will trigger a burst of periodic | ||
73 | * interrupts followed by one or two peaks of longer time, for | ||
74 | * instance when a SD card device flushes its cache, then the periodic | ||
75 | * intervals occur again. A one second inactivity period resets the | ||
76 | * stats, that gives us the certitude the statistical values won't | ||
77 | * exceed 1x10^9, thus the computation won't overflow. | ||
78 | * | ||
79 | * Basically, the purpose of the algorithm is to watch the periodic | ||
80 | * interrupts and eliminate the peaks. | ||
81 | * | ||
82 | * An interrupt is considered periodically stable if the interval of | ||
83 | * its occurences follow the normal distribution, thus the values | ||
84 | * comply with: | ||
85 | * | ||
86 | * avg - 3 x stddev < value < avg + 3 x stddev | ||
87 | * | ||
88 | * Which can be simplified to: | ||
89 | * | ||
90 | * -3 x stddev < value - avg < 3 x stddev | ||
91 | * | ||
92 | * abs(value - avg) < 3 x stddev | ||
93 | * | ||
94 | * In order to save a costly square root computation, we use the | ||
95 | * variance. For the record, stddev = sqrt(variance). The equation | ||
96 | * above becomes: | ||
97 | * | ||
98 | * abs(value - avg) < 3 x sqrt(variance) | ||
99 | * | ||
100 | * And finally we square it: | ||
101 | * | ||
102 | * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 | ||
103 | * | ||
104 | * (value - avg) x (value - avg) < 9 x variance | ||
105 | * | ||
106 | * Statistically speaking, any values out of this interval is | ||
107 | * considered as an anomaly and is discarded. However, a normal | ||
108 | * distribution appears when the number of samples is 30 (it is the | ||
109 | * rule of thumb in statistics, cf. "30 samples" on Internet). When | ||
110 | * there are three consecutive anomalies, the statistics are resetted. | ||
111 | * | ||
112 | */ | ||
113 | static void irqs_update(struct irqt_stat *irqs, u64 ts) | ||
114 | { | ||
115 | u64 old_ts = irqs->last_ts; | ||
116 | u64 variance = 0; | ||
117 | u64 interval; | ||
118 | s64 diff; | ||
119 | |||
120 | /* | ||
121 | * The timestamps are absolute time values, we need to compute | ||
122 | * the timing interval between two interrupts. | ||
123 | */ | ||
124 | irqs->last_ts = ts; | ||
125 | |||
126 | /* | ||
127 | * The interval type is u64 in order to deal with the same | ||
128 | * type in our computation, that prevent mindfuck issues with | ||
129 | * overflow, sign and division. | ||
130 | */ | ||
131 | interval = ts - old_ts; | ||
132 | |||
133 | /* | ||
134 | * The interrupt triggered more than one second apart, that | ||
135 | * ends the sequence as predictible for our purpose. In this | ||
136 | * case, assume we have the beginning of a sequence and the | ||
137 | * timestamp is the first value. As it is impossible to | ||
138 | * predict anything at this point, return. | ||
139 | * | ||
140 | * Note the first timestamp of the sequence will always fall | ||
141 | * in this test because the old_ts is zero. That is what we | ||
142 | * want as we need another timestamp to compute an interval. | ||
143 | */ | ||
144 | if (interval >= NSEC_PER_SEC) { | ||
145 | memset(irqs, 0, sizeof(*irqs)); | ||
146 | irqs->last_ts = ts; | ||
147 | return; | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Pre-compute the delta with the average as the result is | ||
152 | * used several times in this function. | ||
153 | */ | ||
154 | diff = interval - irqs->avg; | ||
155 | |||
156 | /* | ||
157 | * Increment the number of samples. | ||
158 | */ | ||
159 | irqs->nr_samples++; | ||
160 | |||
161 | /* | ||
162 | * Online variance divided by the number of elements if there | ||
163 | * is more than one sample. Normally the formula is division | ||
164 | * by nr_samples - 1 but we assume the number of element will be | ||
165 | * more than 32 and dividing by 32 instead of 31 is enough | ||
166 | * precise. | ||
167 | */ | ||
168 | if (likely(irqs->nr_samples > 1)) | ||
169 | variance = irqs->variance >> IRQ_TIMINGS_SHIFT; | ||
170 | |||
171 | /* | ||
172 | * The rule of thumb in statistics for the normal distribution | ||
173 | * is having at least 30 samples in order to have the model to | ||
174 | * apply. Values outside the interval are considered as an | ||
175 | * anomaly. | ||
176 | */ | ||
177 | if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { | ||
178 | /* | ||
179 | * After three consecutive anomalies, we reset the | ||
180 | * stats as it is no longer stable enough. | ||
181 | */ | ||
182 | if (irqs->anomalies++ >= 3) { | ||
183 | memset(irqs, 0, sizeof(*irqs)); | ||
184 | irqs->last_ts = ts; | ||
185 | return; | ||
186 | } | ||
187 | } else { | ||
188 | /* | ||
189 | * The anomalies must be consecutives, so at this | ||
190 | * point, we reset the anomalies counter. | ||
191 | */ | ||
192 | irqs->anomalies = 0; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * The interrupt is considered stable enough to try to predict | ||
197 | * the next event on it. | ||
198 | */ | ||
199 | irqs->valid = 1; | ||
200 | |||
201 | /* | ||
202 | * Online average algorithm: | ||
203 | * | ||
204 | * new_average = average + ((value - average) / count) | ||
205 | * | ||
206 | * The variance computation depends on the new average | ||
207 | * to be computed here first. | ||
208 | * | ||
209 | */ | ||
210 | irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); | ||
211 | |||
212 | /* | ||
213 | * Online variance algorithm: | ||
214 | * | ||
215 | * new_variance = variance + (value - average) x (value - new_average) | ||
216 | * | ||
217 | * Warning: irqs->avg is updated with the line above, hence | ||
218 | * 'interval - irqs->avg' is no longer equal to 'diff' | ||
219 | */ | ||
220 | irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); | ||
221 | |||
222 | /* | ||
223 | * Update the next event | ||
224 | */ | ||
225 | irqs->next_evt = ts + irqs->avg; | ||
226 | } | ||
227 | |||
/**
 * irq_timings_next_event - Return when the next event is supposed to arrive
 *
 * @now: the current absolute timestamp (nanoseconds)
 *
 * During the last busy cycle, the number of interrupts is incremented
 * and stored in the irq_timings structure. This information is
 * necessary to:
 *
 * - know if the index in the table wrapped up:
 *
 *      If more than the array size interrupts happened during the
 *      last busy/idle cycle, the index wrapped up and we have to
 *      begin with the next element in the array which is the last one
 *      in the sequence, otherwise it is at index 0.
 *
 * - have an indication of the interrupts activity on this CPU
 *   (eg. irq/sec)
 *
 * The values are 'consumed' after inserting in the statistical model,
 * thus the count is reinitialized.
 *
 * The array of values **must** be browsed in the time direction, the
 * timestamp must increase between an element and the next one.
 *
 * Returns a nanosec time based estimation of the earliest interrupt,
 * U64_MAX otherwise.
 */
u64 irq_timings_next_event(u64 now)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	struct irqt_stat *irqs;
	struct irqt_stat __percpu *s;
	u64 ts, next_evt = U64_MAX;
	int i, irq = 0;

	/*
	 * This function must be called with the local irq disabled in
	 * order to prevent the timings circular buffer from being
	 * updated while we are reading it.
	 */
	WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Number of elements in the circular buffer: if it happens it
	 * was flushed before, then the number of elements could be
	 * smaller than IRQ_TIMINGS_SIZE, so the count is used,
	 * otherwise the array size is used as we wrapped. The index
	 * begins from zero when we did not wrap. That could be done
	 * in a nicer way with the proper circular array structure
	 * type but with the cost of extra computation in the
	 * interrupt handler hot path. We choose efficiency.
	 *
	 * Inject measured irq/timestamp to the statistical model
	 * while decrementing the counter because we consume the data
	 * from our circular buffer.
	 *
	 * NOTE(review): when the buffer has NOT wrapped (count <
	 * IRQ_TIMINGS_SIZE), the start index 'count & IRQ_TIMINGS_MASK'
	 * evaluates to 'count', i.e. one past the newest entry rather
	 * than index 0 as the comment above suggests — verify against
	 * the producer's write-index ordering in internals.h.
	 */
	for (i = irqts->count & IRQ_TIMINGS_MASK,
	     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {

		irq = irq_timing_decode(irqts->values[i], &ts);

		s = idr_find(&irqt_stats, irq);
		if (s) {
			irqs = this_cpu_ptr(s);
			irqs_update(irqs, ts);
		}
	}

	/*
	 * Look in the list of interrupts' statistics for the earliest
	 * next event.
	 */
	idr_for_each_entry(&irqt_stats, s, i) {

		irqs = this_cpu_ptr(s);

		if (!irqs->valid)
			continue;

		if (irqs->next_evt <= now) {
			irq = i;
			next_evt = now;

			/*
			 * The predicted time is already in the past:
			 * invalidate this interrupt so it is not used
			 * again until new events occur and update the
			 * statistics.
			 */
			irqs->valid = 0;
			break;
		}

		if (irqs->next_evt < next_evt) {
			irq = i;
			next_evt = irqs->next_evt;
		}
	}

	return next_evt;
}
328 | |||
329 | void irq_timings_free(int irq) | ||
330 | { | ||
331 | struct irqt_stat __percpu *s; | ||
332 | |||
333 | s = idr_find(&irqt_stats, irq); | ||
334 | if (s) { | ||
335 | free_percpu(s); | ||
336 | idr_remove(&irqt_stats, irq); | ||
337 | } | ||
338 | } | ||
339 | |||
340 | int irq_timings_alloc(int irq) | ||
341 | { | ||
342 | struct irqt_stat __percpu *s; | ||
343 | int id; | ||
344 | |||
345 | /* | ||
346 | * Some platforms can have the same private interrupt per cpu, | ||
347 | * so this function may be be called several times with the | ||
348 | * same interrupt number. Just bail out in case the per cpu | ||
349 | * stat structure is already allocated. | ||
350 | */ | ||
351 | s = idr_find(&irqt_stats, irq); | ||
352 | if (s) | ||
353 | return 0; | ||
354 | |||
355 | s = alloc_percpu(*s); | ||
356 | if (!s) | ||
357 | return -ENOMEM; | ||
358 | |||
359 | idr_preload(GFP_KERNEL); | ||
360 | id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); | ||
361 | idr_preload_end(); | ||
362 | |||
363 | if (id < 0) { | ||
364 | free_percpu(s); | ||
365 | return id; | ||
366 | } | ||
367 | |||
368 | return 0; | ||
369 | } | ||