diff options
Diffstat (limited to 'kernel/irq/timings.c')
-rw-r--r-- | kernel/irq/timings.c | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c new file mode 100644 index 000000000000..c8c1d073fbf1 --- /dev/null +++ b/kernel/irq/timings.c | |||
@@ -0,0 +1,369 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/timings.c | ||
3 | * | ||
4 | * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/percpu.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/static_key.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/idr.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/math64.h> | ||
19 | |||
20 | #include <trace/events/irq.h> | ||
21 | |||
22 | #include "internals.h" | ||
23 | |||
/* Global switch for the irq timing machinery; off by default. */
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

/* Per-CPU circular buffer of encoded (irq, timestamp) samples. */
DEFINE_PER_CPU(struct irq_timings, irq_timings);

/*
 * Online statistics for one interrupt line (one instance per CPU,
 * allocated per-cpu in irq_timings_alloc).
 */
struct irqt_stat {
	u64 next_evt;   /* predicted timestamp of the next interrupt (ns) */
	u64 last_ts;    /* timestamp of the last recorded interrupt */
	u64 variance;   /* online variance accumulator (see irqs_update) */
	u32 avg;        /* online average of the interrupt interval (ns) */
	u32 nr_samples; /* number of samples in the current sequence */
	int anomalies;  /* consecutive intervals outside the distribution */
	int valid;      /* non-zero when the stats are usable for prediction */
};

/* Maps an irq number to its per-cpu irqt_stat structure. */
static DEFINE_IDR(irqt_stats);
39 | |||
/*
 * irq_timings_enable - turn on interrupt timings tracking
 *
 * Enables the irq_timing_enabled static branch so the interrupt
 * handling path starts recording timestamps.
 */
void irq_timings_enable(void)
{
	static_branch_enable(&irq_timing_enabled);
}
44 | |||
/*
 * irq_timings_disable - turn off interrupt timings tracking
 *
 * Disables the irq_timing_enabled static branch, stopping timestamp
 * recording in the interrupt handling path.
 */
void irq_timings_disable(void)
{
	static_branch_disable(&irq_timing_enabled);
}
49 | |||
50 | /** | ||
51 | * irqs_update - update the irq timing statistics with a new timestamp | ||
52 | * | ||
53 | * @irqs: an irqt_stat struct pointer | ||
54 | * @ts: the new timestamp | ||
55 | * | ||
56 | * The statistics are computed online, in other words, the code is | ||
57 | * designed to compute the statistics on a stream of values rather | ||
58 | * than doing multiple passes on the values to compute the average, | ||
59 | * then the variance. The integer division introduces a loss of | ||
60 | * precision but with an acceptable error margin regarding the results | ||
61 | * we would have with the double floating precision: we are dealing | ||
62 | * with nanosec, so big numbers, consequently the mantisse is | ||
63 | * negligeable, especially when converting the time in usec | ||
64 | * afterwards. | ||
65 | * | ||
66 | * The computation happens at idle time. When the CPU is not idle, the | ||
67 | * interrupts' timestamps are stored in the circular buffer, when the | ||
68 | * CPU goes idle and this routine is called, all the buffer's values | ||
69 | * are injected in the statistical model continuying to extend the | ||
70 | * statistics from the previous busy-idle cycle. | ||
71 | * | ||
72 | * The observations showed a device will trigger a burst of periodic | ||
73 | * interrupts followed by one or two peaks of longer time, for | ||
74 | * instance when a SD card device flushes its cache, then the periodic | ||
75 | * intervals occur again. A one second inactivity period resets the | ||
76 | * stats, that gives us the certitude the statistical values won't | ||
77 | * exceed 1x10^9, thus the computation won't overflow. | ||
78 | * | ||
79 | * Basically, the purpose of the algorithm is to watch the periodic | ||
80 | * interrupts and eliminate the peaks. | ||
81 | * | ||
82 | * An interrupt is considered periodically stable if the interval of | ||
83 | * its occurences follow the normal distribution, thus the values | ||
84 | * comply with: | ||
85 | * | ||
86 | * avg - 3 x stddev < value < avg + 3 x stddev | ||
87 | * | ||
88 | * Which can be simplified to: | ||
89 | * | ||
90 | * -3 x stddev < value - avg < 3 x stddev | ||
91 | * | ||
92 | * abs(value - avg) < 3 x stddev | ||
93 | * | ||
94 | * In order to save a costly square root computation, we use the | ||
95 | * variance. For the record, stddev = sqrt(variance). The equation | ||
96 | * above becomes: | ||
97 | * | ||
98 | * abs(value - avg) < 3 x sqrt(variance) | ||
99 | * | ||
100 | * And finally we square it: | ||
101 | * | ||
102 | * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 | ||
103 | * | ||
104 | * (value - avg) x (value - avg) < 9 x variance | ||
105 | * | ||
106 | * Statistically speaking, any values out of this interval is | ||
107 | * considered as an anomaly and is discarded. However, a normal | ||
108 | * distribution appears when the number of samples is 30 (it is the | ||
109 | * rule of thumb in statistics, cf. "30 samples" on Internet). When | ||
110 | * there are three consecutive anomalies, the statistics are resetted. | ||
111 | * | ||
112 | */ | ||
113 | static void irqs_update(struct irqt_stat *irqs, u64 ts) | ||
114 | { | ||
115 | u64 old_ts = irqs->last_ts; | ||
116 | u64 variance = 0; | ||
117 | u64 interval; | ||
118 | s64 diff; | ||
119 | |||
120 | /* | ||
121 | * The timestamps are absolute time values, we need to compute | ||
122 | * the timing interval between two interrupts. | ||
123 | */ | ||
124 | irqs->last_ts = ts; | ||
125 | |||
126 | /* | ||
127 | * The interval type is u64 in order to deal with the same | ||
128 | * type in our computation, that prevent mindfuck issues with | ||
129 | * overflow, sign and division. | ||
130 | */ | ||
131 | interval = ts - old_ts; | ||
132 | |||
133 | /* | ||
134 | * The interrupt triggered more than one second apart, that | ||
135 | * ends the sequence as predictible for our purpose. In this | ||
136 | * case, assume we have the beginning of a sequence and the | ||
137 | * timestamp is the first value. As it is impossible to | ||
138 | * predict anything at this point, return. | ||
139 | * | ||
140 | * Note the first timestamp of the sequence will always fall | ||
141 | * in this test because the old_ts is zero. That is what we | ||
142 | * want as we need another timestamp to compute an interval. | ||
143 | */ | ||
144 | if (interval >= NSEC_PER_SEC) { | ||
145 | memset(irqs, 0, sizeof(*irqs)); | ||
146 | irqs->last_ts = ts; | ||
147 | return; | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Pre-compute the delta with the average as the result is | ||
152 | * used several times in this function. | ||
153 | */ | ||
154 | diff = interval - irqs->avg; | ||
155 | |||
156 | /* | ||
157 | * Increment the number of samples. | ||
158 | */ | ||
159 | irqs->nr_samples++; | ||
160 | |||
161 | /* | ||
162 | * Online variance divided by the number of elements if there | ||
163 | * is more than one sample. Normally the formula is division | ||
164 | * by nr_samples - 1 but we assume the number of element will be | ||
165 | * more than 32 and dividing by 32 instead of 31 is enough | ||
166 | * precise. | ||
167 | */ | ||
168 | if (likely(irqs->nr_samples > 1)) | ||
169 | variance = irqs->variance >> IRQ_TIMINGS_SHIFT; | ||
170 | |||
171 | /* | ||
172 | * The rule of thumb in statistics for the normal distribution | ||
173 | * is having at least 30 samples in order to have the model to | ||
174 | * apply. Values outside the interval are considered as an | ||
175 | * anomaly. | ||
176 | */ | ||
177 | if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { | ||
178 | /* | ||
179 | * After three consecutive anomalies, we reset the | ||
180 | * stats as it is no longer stable enough. | ||
181 | */ | ||
182 | if (irqs->anomalies++ >= 3) { | ||
183 | memset(irqs, 0, sizeof(*irqs)); | ||
184 | irqs->last_ts = ts; | ||
185 | return; | ||
186 | } | ||
187 | } else { | ||
188 | /* | ||
189 | * The anomalies must be consecutives, so at this | ||
190 | * point, we reset the anomalies counter. | ||
191 | */ | ||
192 | irqs->anomalies = 0; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * The interrupt is considered stable enough to try to predict | ||
197 | * the next event on it. | ||
198 | */ | ||
199 | irqs->valid = 1; | ||
200 | |||
201 | /* | ||
202 | * Online average algorithm: | ||
203 | * | ||
204 | * new_average = average + ((value - average) / count) | ||
205 | * | ||
206 | * The variance computation depends on the new average | ||
207 | * to be computed here first. | ||
208 | * | ||
209 | */ | ||
210 | irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); | ||
211 | |||
212 | /* | ||
213 | * Online variance algorithm: | ||
214 | * | ||
215 | * new_variance = variance + (value - average) x (value - new_average) | ||
216 | * | ||
217 | * Warning: irqs->avg is updated with the line above, hence | ||
218 | * 'interval - irqs->avg' is no longer equal to 'diff' | ||
219 | */ | ||
220 | irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); | ||
221 | |||
222 | /* | ||
223 | * Update the next event | ||
224 | */ | ||
225 | irqs->next_evt = ts + irqs->avg; | ||
226 | } | ||
227 | |||
/**
 * irq_timings_next_event - Return when the next event is supposed to arrive
 *
 * @now: the current absolute timestamp (nanoseconds)
 *
 * During the last busy cycle, the number of interrupts is incremented
 * and stored in the irq_timings structure. This information is
 * necessary to:
 *
 * - know if the index in the table wrapped up:
 *
 *      If more than the array size interrupts happened during the
 *      last busy/idle cycle, the index wrapped up and we have to
 *      begin with the next element in the array which is the last one
 *      in the sequence, otherwise it is at index 0.
 *
 * - have an indication of the interrupts activity on this CPU
 *   (eg. irq/sec)
 *
 * The values are 'consumed' after inserting in the statistical model,
 * thus the count is reinitialized.
 *
 * The array of values **must** be browsed in the time direction, the
 * timestamp must increase between an element and the next one.
 *
 * Returns a nanosec time based estimation of the earliest interrupt,
 * U64_MAX otherwise.
 */
u64 irq_timings_next_event(u64 now)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	struct irqt_stat *irqs;
	struct irqt_stat __percpu *s;
	u64 ts, next_evt = U64_MAX;
	int i, irq = 0;

	/*
	 * This function must be called with the local irq disabled in
	 * order to prevent the timings circular buffer from being
	 * updated while we are reading it.
	 */
	WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Number of elements in the circular buffer: if it happens it
	 * was flushed before, then the number of elements could be
	 * smaller than IRQ_TIMINGS_SIZE, so the count is used,
	 * otherwise the array size is used as we wrapped. The index
	 * begins from zero when we did not wrap. That could be done
	 * in a nicer way with the proper circular array structure
	 * type but with the cost of extra computation in the
	 * interrupt handler hot path. We choose efficiency.
	 *
	 * Inject measured irq/timestamp to the statistical model
	 * while decrementing the counter because we consume the data
	 * from our circular buffer.
	 *
	 * NOTE(review): when the buffer has NOT wrapped (count <
	 * IRQ_TIMINGS_SIZE), the start index 'count & IRQ_TIMINGS_MASK'
	 * evaluates to 'count', i.e. one past the newest entry rather
	 * than index 0 as the comment above suggests — verify against
	 * the producer's write-index ordering in internals.h.
	 */
	for (i = irqts->count & IRQ_TIMINGS_MASK,
	     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {

		irq = irq_timing_decode(irqts->values[i], &ts);

		s = idr_find(&irqt_stats, irq);
		if (s) {
			irqs = this_cpu_ptr(s);
			irqs_update(irqs, ts);
		}
	}

	/*
	 * Look in the list of interrupts' statistics for the earliest
	 * next event.
	 */
	idr_for_each_entry(&irqt_stats, s, i) {

		irqs = this_cpu_ptr(s);

		if (!irqs->valid)
			continue;

		if (irqs->next_evt <= now) {
			irq = i;
			next_evt = now;

			/*
			 * The predicted time is already in the past:
			 * invalidate this interrupt so it is not used
			 * again until new events occur and update the
			 * statistics.
			 */
			irqs->valid = 0;
			break;
		}

		if (irqs->next_evt < next_evt) {
			irq = i;
			next_evt = irqs->next_evt;
		}
	}

	return next_evt;
}
328 | |||
329 | void irq_timings_free(int irq) | ||
330 | { | ||
331 | struct irqt_stat __percpu *s; | ||
332 | |||
333 | s = idr_find(&irqt_stats, irq); | ||
334 | if (s) { | ||
335 | free_percpu(s); | ||
336 | idr_remove(&irqt_stats, irq); | ||
337 | } | ||
338 | } | ||
339 | |||
340 | int irq_timings_alloc(int irq) | ||
341 | { | ||
342 | struct irqt_stat __percpu *s; | ||
343 | int id; | ||
344 | |||
345 | /* | ||
346 | * Some platforms can have the same private interrupt per cpu, | ||
347 | * so this function may be be called several times with the | ||
348 | * same interrupt number. Just bail out in case the per cpu | ||
349 | * stat structure is already allocated. | ||
350 | */ | ||
351 | s = idr_find(&irqt_stats, irq); | ||
352 | if (s) | ||
353 | return 0; | ||
354 | |||
355 | s = alloc_percpu(*s); | ||
356 | if (!s) | ||
357 | return -ENOMEM; | ||
358 | |||
359 | idr_preload(GFP_KERNEL); | ||
360 | id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); | ||
361 | idr_preload_end(); | ||
362 | |||
363 | if (id < 0) { | ||
364 | free_percpu(s); | ||
365 | return id; | ||
366 | } | ||
367 | |||
368 | return 0; | ||
369 | } | ||