author	Daniel Lezcano <daniel.lezcano@linaro.org>	2017-06-23 10:11:08 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2017-06-24 05:44:39 -0400
commit	e1c921495534002d727b15a76a2f8c20b6b108b5 (patch)
tree	28500774c5b8ff667a304d9a74ead820590f85e3
parent	b2d3d61adb7b73cfe5f82404f7a130a76fc64232 (diff)
genirq/timings: Add infrastructure for estimating the next interrupt arrival time
An interrupt typically behaves as a burst of activity at periodic intervals
of time, followed by one or two peaks of longer interval. As the time
intervals are periodic, statistically speaking they follow a normal
distribution, and each interrupt can be tracked individually.

Add a mechanism to compute the statistics on all interrupts, except the
timers, which are deterministic from a prediction point of view as their
expiry time is known.

The goal is to extract the periodicity of each interrupt, add it to the
last timestamp, and so predict the next event to a certain extent. Taking
the earliest prediction gives the expected wakeup of the system (assuming
a timer won't expire before).

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: "Rafael J . Wysocki" <rafael@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/1498227072-5980-2-git-send-email-daniel.lezcano@linaro.org
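[Editor's note: to make the statistical model concrete, below is a minimal,
self-contained userspace sketch of the online average/variance update and
the squared 3-sigma test that the new kernel/irq/timings.c code implements.
All names here are hypothetical and not part of the patch; SHIFT stands in
for the kernel's IRQ_TIMINGS_SHIFT (dividing by 2^5 = 32 instead of
nr_samples - 1), and unlike the kernel code, which tolerates up to three
consecutive anomalies before resetting, this sketch simply skips an
anomalous sample.]

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define SHIFT	5

struct demo_stat {
	int64_t variance;
	int64_t avg;
	unsigned int nr_samples;
};

/* Feed one interval (in ns); returns 1 if the value is an anomaly. */
static int demo_update(struct demo_stat *s, int64_t interval)
{
	int64_t diff = interval - s->avg;
	int64_t variance = s->nr_samples > 1 ? s->variance >> SHIFT : 0;

	s->nr_samples++;

	/* squared 3-sigma test: (value - avg)^2 < 9 x variance */
	if (s->nr_samples >= 30 && diff * diff > 9 * variance)
		return 1;

	/* online average: avg += (value - avg) / 32 */
	s->avg += diff >> SHIFT;

	/* online variance: var += (value - old_avg) x (value - new_avg) */
	s->variance += diff * (interval - s->avg);

	return 0;
}

int main(void)
{
	struct demo_stat s = { 0 };

	/* ~1 ms periodic intervals with one long 100 ms peak */
	for (int i = 0; i < 40; i++) {
		int64_t interval = (i == 35) ? 100000000 :
				   1000000 + rand() % 2000;

		if (demo_update(&s, interval))
			printf("sample %d: %lld ns discarded as anomaly\n",
			       i, (long long)interval);
	}
	printf("running average: %lld ns\n", (long long)s.avg);
	return 0;
}

[The long peak at sample 35 fails the 3-sigma test and is skipped, while
the periodic samples keep refining the shift-based running average, which
converges toward the ~1 ms period.]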
-rw-r--r--	include/linux/interrupt.h	1
-rw-r--r--	kernel/irq/internals.h	19
-rw-r--r--	kernel/irq/timings.c	339
3 files changed, 359 insertions, 0 deletions
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9f617238a2f7..37f8e354f564 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -706,6 +706,7 @@ static inline void init_irq_proc(void)
 #ifdef CONFIG_IRQ_TIMINGS
 void irq_timings_enable(void);
 void irq_timings_disable(void);
+u64 irq_timings_next_event(u64 now);
 #endif
 
 struct seq_file;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b95b74920433..9da14d125df4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -275,13 +275,21 @@ struct irq_timings {
 
 DECLARE_PER_CPU(struct irq_timings, irq_timings);
 
+extern void irq_timings_free(int irq);
+extern int irq_timings_alloc(int irq);
+
 static inline void irq_remove_timings(struct irq_desc *desc)
 {
 	desc->istate &= ~IRQS_TIMINGS;
+
+	irq_timings_free(irq_desc_get_irq(desc));
 }
 
 static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
 {
+	int irq = irq_desc_get_irq(desc);
+	int ret;
+
 	/*
 	 * We don't need the measurement because the idle code already
 	 * knows the next expiry event.
@@ -289,6 +297,17 @@ static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *ac
 	if (act->flags & __IRQF_TIMER)
 		return;
 
+	/*
+	 * In case the timing allocation fails, we just want to warn,
+	 * not fail, so let the system boot anyway.
+	 */
+	ret = irq_timings_alloc(irq);
+	if (ret) {
+		pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
+			irq, ret);
+		return;
+	}
+
 	desc->istate |= IRQS_TIMINGS;
 }
 
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 56cf6870fa26..c8c1d073fbf1 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -8,10 +8,16 @@
  * published by the Free Software Foundation.
  *
  */
+#include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/slab.h>
 #include <linux/static_key.h>
 #include <linux/interrupt.h>
+#include <linux/idr.h>
 #include <linux/irq.h>
+#include <linux/math64.h>
+
+#include <trace/events/irq.h>
 
 #include "internals.h"
 
@@ -19,6 +25,18 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
 
 DEFINE_PER_CPU(struct irq_timings, irq_timings);
 
+struct irqt_stat {
+	u64	next_evt;
+	u64	last_ts;
+	u64	variance;
+	u32	avg;
+	u32	nr_samples;
+	int	anomalies;
+	int	valid;
+};
+
+static DEFINE_IDR(irqt_stats);
+
 void irq_timings_enable(void)
 {
 	static_branch_enable(&irq_timing_enabled);
@@ -28,3 +46,324 @@ void irq_timings_disable(void)
 {
 	static_branch_disable(&irq_timing_enabled);
 }
+
+/**
+ * irqs_update - update the irq timing statistics with a new timestamp
+ *
+ * @irqs: an irqt_stat struct pointer
+ * @ts: the new timestamp
+ *
+ * The statistics are computed online, in other words, the code is
+ * designed to compute the statistics on a stream of values rather
+ * than doing multiple passes on the values to compute the average,
+ * then the variance. The integer division introduces a loss of
+ * precision but with an acceptable error margin compared to the
+ * results we would have with double precision floating point: we
+ * are dealing with nanoseconds, so big numbers, consequently the
+ * mantissa is negligible, especially when converting the time to
+ * microseconds afterwards.
+ *
+ * The computation happens at idle time. When the CPU is not idle,
+ * the interrupts' timestamps are stored in the circular buffer;
+ * when the CPU goes idle and this routine is called, all the
+ * buffer's values are injected into the statistical model,
+ * continuing to extend the statistics from the previous busy-idle
+ * cycle.
+ *
+ * Observation showed a device will trigger a burst of periodic
+ * interrupts followed by one or two peaks of longer time, for
+ * instance when an SD card device flushes its cache; then the
+ * periodic intervals occur again. A one second inactivity period
+ * resets the stats, which guarantees the statistical values won't
+ * exceed 1x10^9, thus the computation won't overflow.
+ *
+ * Basically, the purpose of the algorithm is to watch the periodic
+ * interrupts and eliminate the peaks.
+ *
+ * An interrupt is considered periodically stable if the intervals
+ * of its occurrences follow the normal distribution, thus the
+ * values comply with:
+ *
+ *      avg - 3 x stddev < value < avg + 3 x stddev
+ *
+ * Which can be simplified to:
+ *
+ *      -3 x stddev < value - avg < 3 x stddev
+ *
+ *      abs(value - avg) < 3 x stddev
+ *
+ * In order to save a costly square root computation, we use the
+ * variance. For the record, stddev = sqrt(variance). The equation
+ * above becomes:
+ *
+ *      abs(value - avg) < 3 x sqrt(variance)
+ *
+ * And finally we square it:
+ *
+ *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ *
+ *      (value - avg) x (value - avg) < 9 x variance
+ *
+ * Statistically speaking, any value outside this interval is
+ * considered an anomaly and is discarded. However, a normal
+ * distribution only appears when the number of samples is at least
+ * 30 (the usual rule of thumb in statistics). When there are three
+ * consecutive anomalies, the statistics are reset.
+ *
+ */
+static void irqs_update(struct irqt_stat *irqs, u64 ts)
+{
+	u64 old_ts = irqs->last_ts;
+	u64 variance = 0;
+	u64 interval;
+	s64 diff;
+
+	/*
+	 * The timestamps are absolute time values; we need to compute
+	 * the timing interval between two interrupts.
+	 */
+	irqs->last_ts = ts;
+
+	/*
+	 * The interval type is u64 in order to deal with the same
+	 * type in our computation, which prevents tricky issues with
+	 * overflow, sign and division.
+	 */
+	interval = ts - old_ts;
+
+	/*
+	 * The interrupt triggered more than one second apart; that
+	 * ends the sequence as predictable for our purpose. In this
+	 * case, assume we have the beginning of a sequence and the
+	 * timestamp is the first value. As it is impossible to
+	 * predict anything at this point, return.
+	 *
+	 * Note the first timestamp of the sequence will always fall
+	 * in this test because old_ts is zero. That is what we want,
+	 * as we need another timestamp to compute an interval.
+	 */
+	if (interval >= NSEC_PER_SEC) {
+		memset(irqs, 0, sizeof(*irqs));
+		irqs->last_ts = ts;
+		return;
+	}
+
+	/*
+	 * Pre-compute the delta with the average as the result is
+	 * used several times in this function.
+	 */
+	diff = interval - irqs->avg;
+
+	/*
+	 * Increment the number of samples.
+	 */
+	irqs->nr_samples++;
+
+	/*
+	 * Online variance divided by the number of elements if there
+	 * is more than one sample. Normally the formula is division
+	 * by nr_samples - 1, but we assume the number of elements
+	 * will be more than 32, and dividing by 32 instead of 31 is
+	 * precise enough.
+	 */
+	if (likely(irqs->nr_samples > 1))
+		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
+
+	/*
+	 * The rule of thumb in statistics for the normal distribution
+	 * is having at least 30 samples in order for the model to
+	 * apply. Values outside the interval are considered an
+	 * anomaly.
+	 */
+	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
+		/*
+		 * After three consecutive anomalies, we reset the
+		 * stats as the series is no longer stable enough.
+		 */
+		if (irqs->anomalies++ >= 3) {
+			memset(irqs, 0, sizeof(*irqs));
+			irqs->last_ts = ts;
+			return;
+		}
+	} else {
+		/*
+		 * The anomalies must be consecutive, so at this
+		 * point, we reset the anomalies counter.
+		 */
+		irqs->anomalies = 0;
+	}
+
+	/*
+	 * The interrupt is considered stable enough to try to predict
+	 * the next event on it.
+	 */
+	irqs->valid = 1;
+
+	/*
+	 * Online average algorithm:
+	 *
+	 *  new_average = average + ((value - average) / count)
+	 *
+	 * The variance computation depends on the new average
+	 * to be computed here first.
+	 *
+	 */
+	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+
+	/*
+	 * Online variance algorithm:
+	 *
+	 *  new_variance = variance + (value - average) x (value - new_average)
+	 *
+	 * Warning: irqs->avg is updated with the line above, hence
+	 * 'interval - irqs->avg' is no longer equal to 'diff'.
+	 */
+	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
+
+	/*
+	 * Update the next event.
+	 */
+	irqs->next_evt = ts + irqs->avg;
+}
+
+/**
+ * irq_timings_next_event - Return when the next event is supposed to arrive
+ *
+ * During the last busy cycle, the number of interrupts is incremented
+ * and stored in the irq_timings structure. This information is
+ * necessary to:
+ *
+ * - know if the index in the table wrapped up:
+ *
+ *      If more than the array size interrupts happened during the
+ *      last busy/idle cycle, the index wrapped up and we have to
+ *      begin with the next element in the array, which is the last
+ *      one in the sequence; otherwise it is at index 0.
+ *
+ * - have an indication of the interrupts' activity on this CPU
+ *   (eg. irq/sec)
+ *
+ * The values are 'consumed' after being inserted into the statistical
+ * model, thus the count is reinitialized.
+ *
+ * The array of values **must** be browsed in the time direction, the
+ * timestamp must increase between an element and the next one.
+ *
+ * Returns a nanosecond-based estimate of the earliest interrupt,
+ * U64_MAX otherwise.
+ */
+u64 irq_timings_next_event(u64 now)
+{
+	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+	struct irqt_stat *irqs;
+	struct irqt_stat __percpu *s;
+	u64 ts, next_evt = U64_MAX;
+	int i, irq = 0;
+
+	/*
+	 * This function must be called with local irqs disabled in
+	 * order to prevent the timings circular buffer from being
+	 * updated while we are reading it.
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+
+	/*
+	 * Number of elements in the circular buffer: if it happens it
+	 * was flushed before, then the number of elements could be
+	 * smaller than IRQ_TIMINGS_SIZE, so the count is used;
+	 * otherwise the array size is used as we wrapped. The index
+	 * begins from zero when we did not wrap. That could be done
+	 * in a nicer way with the proper circular array structure
+	 * type, but at the cost of extra computation in the
+	 * interrupt handler hot path. We choose efficiency.
+	 *
+	 * Inject the measured irq/timestamp into the statistical
+	 * model while decrementing the counter because we consume the
+	 * data from our circular buffer.
+	 */
+	for (i = irqts->count & IRQ_TIMINGS_MASK,
+	     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
+	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+
+		irq = irq_timing_decode(irqts->values[i], &ts);
+
+		s = idr_find(&irqt_stats, irq);
+		if (s) {
+			irqs = this_cpu_ptr(s);
+			irqs_update(irqs, ts);
+		}
+	}
+
+	/*
+	 * Look in the list of interrupts' statistics for the earliest
+	 * next event.
+	 */
+	idr_for_each_entry(&irqt_stats, s, i) {
+
+		irqs = this_cpu_ptr(s);
+
+		if (!irqs->valid)
+			continue;
+
+		if (irqs->next_evt <= now) {
+			irq = i;
+			next_evt = now;
+
+			/*
+			 * This interrupt must not be used in the
+			 * future until new events occur and update
+			 * the statistics.
+			 */
+			irqs->valid = 0;
+			break;
+		}
+
+		if (irqs->next_evt < next_evt) {
+			irq = i;
+			next_evt = irqs->next_evt;
+		}
+	}
+
+	return next_evt;
+}
+
+void irq_timings_free(int irq)
+{
+	struct irqt_stat __percpu *s;
+
+	s = idr_find(&irqt_stats, irq);
+	if (s) {
+		free_percpu(s);
+		idr_remove(&irqt_stats, irq);
+	}
+}
+
+int irq_timings_alloc(int irq)
+{
+	struct irqt_stat __percpu *s;
+	int id;
+
+	/*
+	 * Some platforms can have the same private interrupt per cpu,
+	 * so this function may be called several times with the
+	 * same interrupt number. Just bail out in case the per cpu
+	 * stat structure is already allocated.
+	 */
+	s = idr_find(&irqt_stats, irq);
+	if (s)
+		return 0;
+
+	s = alloc_percpu(*s);
+	if (!s)
+		return -ENOMEM;
+
+	idr_preload(GFP_KERNEL);
+	id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
+	idr_preload_end();
+
+	if (id < 0) {
+		free_percpu(s);
+		return id;
+	}
+
+	return 0;
+}
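
[Editor's note: for context, a hedged sketch of the consumer side this
patch prepares for: an idle-path helper that turns the prediction into a
sleep-length estimate. Only irq_timings_next_event(), ktime_get_ns() and
the irqs-disabled requirement come from this patch and existing kernel
APIs; the helper name and its surrounding use are hypothetical. Assumes
CONFIG_IRQ_TIMINGS=y and that the statistics were turned on with
irq_timings_enable().]

#include <linux/interrupt.h>
#include <linux/ktime.h>

static u64 predicted_sleep_length_ns(void)
{
	u64 now = ktime_get_ns();
	u64 next;

	/*
	 * The caller (the idle loop) runs with local interrupts
	 * disabled, which irq_timings_next_event() requires so the
	 * per-CPU circular buffer cannot change under it.
	 */
	next = irq_timings_next_event(now);
	if (next == U64_MAX)
		return U64_MAX;	/* no usable prediction */

	return next > now ? next - now : 0;
}

[An idle governor could compare this estimate with the next timer expiry
and pick the shallower C-state when an interrupt is predicted sooner.]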