1 files changed, 363 insertions, 159 deletions
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..90c735da15d0 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
 #include <linux/idr.h>
 #include <linux/irq.h>
 #include <linux/math64.h>
+#include <linux/log2.h>
 #include <trace/events/irq.h>
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
 DEFINE_PER_CPU(struct irq_timings, irq_timings);
-struct irqt_stat {
-        u64     next_evt;
-        u64     last_ts;
-        u64     variance;
-        u32     avg;
-        u32     nr_samples;
-        int     anomalies;
-        int     valid;
-};
 static DEFINE_IDR(irqt_stats);
 void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
        static_branch_disable(&irq_timing_enabled);
 }
-/**
+/*
- * irqs_update - update the irq timing statistics with a new timestamp
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
+ *
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
+ *
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. The experience
+ * showed the interrupts are often coming following a periodic
+ * pattern.
+ *
+ * The objective of the algorithm is to find out this periodic pattern
+ * in a fastest way and use its period to predict the next irq event.
+ *
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
+ *
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
+ *
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average
+ *
+ * 1. Suffix array
+ *
+ * Suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance for the word 'banana', the suffixes will be: 'banana'
+ * 'anana' 'nana' 'ana' 'na' 'a'
+ *
+ * Usually, the suffix array is sorted but for our purpose it is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem where we clearly define the boundaries of the
+ * search by a max period and min period.
+ *
+ * The suffix array will build a suite of intervals of different
+ * length and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period because it is the length of
+ * the suite whatever its position in the buffer.
+ *
+ * 2. Log interval
+ *
+ * We saw the irq timings allow to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonibly assume the
+ * longer is the interval, the higher is the error for the next event
+ * and we can consider storing those interval values into an array
+ * where each slot in the array correspond to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^11 and 2^12.
+ *
+ * At the end we have an array of values where at each index defines a
+ * [2^index - 1, 2 ^ index] interval values allowing to store a large
+ * number of values inside a small array.
+ *
+ * For example, if we have the value 1123, then we store it at
+ * ilog2(1123) = 10 index value.
+ *
+ * Storing those value at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * for values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
+ *
+ * 3. Exponential Moving Average
+ *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula, is it is very simple and the
+ * reactivity of the average can be tuned with the factors called
+ * alpha.
+ *
+ * The higher the alphas are, the faster the average respond to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ *  -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ *      For each interval:
+ *              Compute the index = ilog2(interval)
+ *              Compute a new_ema(buffer[index], interval)
+ *              Store the index in a circular buffer
+ *
+ *      Compute the suffix array of the indexes
+ *
+ *      For each suffix:
+ *              If the suffix is reverse-found 3 times
+ *                      Return suffix
+ *
+ *      Return Not found
+ *
+ * However we can not have endless suffix array to be build, it won't
+ * make sense and it will add an extra overhead, so we can restrict
+ * this to a maximum suffix length of 5 and a minimum suffix length of
+ * 2. The experience showed 5 is the majority of the maximum pattern
+ * period found for different devices.
+ *
+ * The result is a pattern finding less than 1us for an interrupt.
 *
- * @irqs: an irqt_stat struct pointer
+ * Example based on real values:
- * @ts: the new timestamp
 *
- * The statistics are computed online, in other words, the code is
+ * Example 1 : MMC write/read interrupt interval:
- * designed to compute the statistics on a stream of values rather
- * than doing multiple passes on the values to compute the average,
- * then the variance. The integer division introduces a loss of
- * precision but with an acceptable error margin regarding the results
- * we would have with the double floating precision: we are dealing
- * with nanosec, so big numbers, consequently the mantisse is
- * negligeable, especially when converting the time in usec
- * afterwards.
 *
- * The computation happens at idle time. When the CPU is not idle, the
+ *      223947, 1240, 1384, 1386, 1386,
- * interrupts' timestamps are stored in the circular buffer, when the
+ *      217416, 1236, 1384, 1386, 1387,
- * CPU goes idle and this routine is called, all the buffer's values
+ *      214719, 1241, 1386, 1387, 1384,
- * are injected in the statistical model continuying to extend the
+ *      213696, 1234, 1384, 1386, 1388,
- * statistics from the previous busy-idle cycle.
+ *      219904, 1240, 1385, 1389, 1385,
+ *      212240, 1240, 1386, 1386, 1386,
+ *      214415, 1236, 1384, 1386, 1387,
+ *      214276, 1234, 1384, 1388, ?
 *
- * The observations showed a device will trigger a burst of periodic
+ * For each element, apply ilog2(value)
- * interrupts followed by one or two peaks of longer time, for
- * instance when a SD card device flushes its cache, then the periodic
- * intervals occur again. A one second inactivity period resets the
- * stats, that gives us the certitude the statistical values won't
- * exceed 1x10^9, thus the computation won't overflow.
 *
- * Basically, the purpose of the algorithm is to watch the periodic
+ *      15, 8, 8, 8, 8,
- * interrupts and eliminate the peaks.
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, ?
 *
- * An interrupt is considered periodically stable if the interval of
+ * Max period of 5, we take the last (max_period * 3) 15 elements as
- * its occurences follow the normal distribution, thus the values
+ * we can be confident if the pattern repeats itself three times it is
- * comply with:
+ * a repeating pattern.
 *
- *      avg - 3 x stddev < value < avg + 3 x stddev
+ *                   8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, 8,
+ *      15, 8, 8, 8, ?
 *
- * Which can be simplified to:
+ * Suffixes are:
 *
- *      -3 x stddev < value - avg < 3 x stddev
+ *  1) 8, 15, 8, 8, 8  <- max period
+ *  2) 8, 15, 8, 8
+ *  3) 8, 15, 8
+ *  4) 8, 15           <- min period
 *
- *      abs(value - avg) < 3 x stddev
+ * From there we search the repeating pattern for each suffix.
 *
- * In order to save a costly square root computation, we use the
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
- * variance. For the record, stddev = sqrt(variance). The equation
+ *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
- * above becomes:
+ *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
+ *                         8, 15, 8, 8, 8  |   |  |  |  |
+ *                                         8, 15, 8, 8, 8
 *
- *      abs(value - avg) < 3 x sqrt(variance)
+ * When moving the suffix, we found exactly 3 matches.
 *
- * And finally we square it:
+ * The first suffix with period 5 is repeating.
 *
- *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ * The next event is (3 * max_period) % suffix_period
 *
- *      (value - avg) x (value - avg) < 9 x variance
+ * In this example, the result 0, so the next event is suffix[0] => 8
 *
- * Statistically speaking, any values out of this interval is
+ * However, 8 is the index in the array of exponential moving average
- * considered as an anomaly and is discarded. However, a normal
+ * which was calculated on the fly when storing the values, so the
- * distribution appears when the number of samples is 30 (it is the
+ * interval is ema[8] = 1366
- * rule of thumb in statistics, cf. "30 samples" on Internet). When
- * there are three consecutive anomalies, the statistics are resetted.
 *
+ *
+ * Example 2:
+ *
+ *      4, 3, 5, 100,
+ *      3, 3, 5, 117,
+ *      4, 4, 5, 112,
+ *      4, 3, 4, 110,
+ *      3, 5, 3, 117,
+ *      4, 4, 5, 112,
+ *      4, 3, 4, 110,
+ *      3, 4, 5, 112,
+ *      4, 3, 4, 110
+ *
+ * ilog2
+ *
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4
+ *
+ * Max period 5:
+ *         0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4,
+ *      0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ *  1) 0, 0, 4, 0, 0
+ *  2) 0, 0, 4, 0
+ *  3) 0, 0, 4
+ *  4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  X
+ *         0, 0, 4, 0, 0, |  X
+ *                        0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+ *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
+ *                     0, 0, 4, 0, |  |  |  |  |  |  |
+ *                                 0, 0, 4, 0, |  |  |
+ *                                             0  0  4
+ *
+ * Pattern is found 3 times, the remaining is 1 which results from
+ * (max_period * 3) % suffix_period. This value is the index in the
+ * suffix arrays. The suffix array for a period 4 has the value 4
+ * at index 1.
+ */
+#define EMA_ALPHA_VAL           64
+#define EMA_ALPHA_SHIFT         7
+#define PREDICTION_PERIOD_MIN   2
+#define PREDICTION_PERIOD_MAX   5
+#define PREDICTION_FACTOR       4
+#define PREDICTION_MAX          10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE  16 /* slots for EMAs, hardly more than 16 */
+struct irqt_stat {
+        u64     last_ts;
+        u64     ema_time[PREDICTION_BUFFER_SIZE];
+        int     timings[IRQ_TIMINGS_SIZE];
+        int     circ_timings[IRQ_TIMINGS_SIZE];
+        int     count;
+};
+/*
+ * Exponential moving average computation
 */
-static void irqs_update(struct irqt_stat *irqs, u64 ts)
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
+{
+        s64 diff;
+        if (unlikely(!ema_old))
+                return value;
+        diff = (value - ema_old) * EMA_ALPHA_VAL;
+        /*
+         * We can use a s64 type variable to be added with the u64
+         * ema_old variable as this one will never have its topmost
+         * bit set, it will be always smaller than 2^63 nanosec
+         * interrupt interval (292 years).
+         */
+        return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+        int i;
+        /*
+         * The buffer contains the suite of intervals, in a ilog2
+         * basis, we are looking for a repetition. We point the
+         * beginning of the search three times the length of the
+         * period beginning at the end of the buffer. We do that for
+         * each suffix.
+         */
+        for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+                int *begin = &buffer[len - (i * 3)];
+                int *ptr = begin;
+                /*
+                 * We look if the suite with period 'i' repeat
+                 * itself. If it is truncated at the end, as it
+                 * repeats we can use the period to find out the next
+                 * element.
+                 */
+                while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
+                        ptr += i;
+                        if (ptr >= &buffer[len])
+                                return begin[((i * 3) % i)];
+                }
+        }
+        return -1;
+}
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+        int index, i, period_max, count, start, min = INT_MAX;
+        if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+                irqs->count = irqs->last_ts = 0;
+                return U64_MAX;
+        }
+        /*
+         * As we want to find three times the repetition, we need a
+         * number of intervals greater or equal to three times the
+         * maximum period, otherwise we truncate the max period.
+         */
+        period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+                PREDICTION_PERIOD_MAX : irqs->count / 3;
+        /*
+         * If we don't have enough irq timings for this prediction,
+         * just bail out.
+         */
+        if (period_max <= PREDICTION_PERIOD_MIN)
+                return U64_MAX;
+        /*
+         * 'count' will depends if the circular buffer wrapped or not
+         */
+        count = irqs->count < IRQ_TIMINGS_SIZE ?
+                irqs->count : IRQ_TIMINGS_SIZE;
+        start = irqs->count < IRQ_TIMINGS_SIZE ?
+                0 : (irqs->count & IRQ_TIMINGS_MASK);
+        /*
+         * Copy the content of the circular buffer into another buffer
+         * in order to linearize the buffer instead of dealing with
+         * wrapping indexes and shifted array which will be prone to
+         * error and extremelly difficult to debug.
+         */
+        for (i = 0; i < count; i++) {
+                int index = (start + i) & IRQ_TIMINGS_MASK;
+                irqs->timings[i] = irqs->circ_timings[index];
+                min = min_t(int, irqs->timings[i], min);
+        }
+        index = irq_timings_next_event_index(irqs->timings, count, period_max);
+        if (index < 0)
+                return irqs->last_ts + irqs->ema_time[min];
+        return irqs->last_ts + irqs->ema_time[index];
+}
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
 {
        u64 old_ts = irqs->last_ts;
-        u64 variance = 0;
        u64 interval;
-        s64 diff;
+        int index;
        /*
         * The timestamps are absolute time values, we need to compute
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
         * want as we need another timestamp to compute an interval.
         */
        if (interval >= NSEC_PER_SEC) {
-                memset(irqs, 0, sizeof(*irqs));
+                irqs->count = 0;
-                irqs->last_ts = ts;
                return;
        }
        /*
-         * Pre-compute the delta with the average as the result is
+         * Get the index in the ema table for this interrupt. The
-         * used several times in this function.
+         * PREDICTION_FACTOR increase the interval size for the array
-         */
+         * of exponential average.
-        diff = interval - irqs->avg;
-        /*
-         * Increment the number of samples.
-         */
-        irqs->nr_samples++;
-        /*
-         * Online variance divided by the number of elements if there
-         * is more than one sample.  Normally the formula is division
-         * by nr_samples - 1 but we assume the number of element will be
-         * more than 32 and dividing by 32 instead of 31 is enough
-         * precise.
-         */
-        if (likely(irqs->nr_samples > 1))
-                variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
-        /*
-         * The rule of thumb in statistics for the normal distribution
-         * is having at least 30 samples in order to have the model to
-         * apply. Values outside the interval are considered as an
-         * anomaly.
-         */
-        if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
-                /*
-                 * After three consecutive anomalies, we reset the
-                 * stats as it is no longer stable enough.
-                 */
-                if (irqs->anomalies++ >= 3) {
-                        memset(irqs, 0, sizeof(*irqs));
-                        irqs->last_ts = ts;
-                        return;
-                }
-        } else {
-                /*
-                 * The anomalies must be consecutives, so at this
-                 * point, we reset the anomalies counter.
-                 */
-                irqs->anomalies = 0;
-        }
-        /*
-         * The interrupt is considered stable enough to try to predict
-         * the next event on it.
         */
-        irqs->valid = 1;
+        index = likely(interval) ?
+                ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
        /*
-         * Online average algorithm:
+         * Store the index as an element of the pattern in another
-         *
+         * circular array.
-         *  new_average = average + ((value - average) / count)
-         *
-         * The variance computation depends on the new average
-         * to be computed here first.
-         *
         */
-        irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+        irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
-        /*
+        irqs->ema_time[index] = irq_timings_ema_new(interval,
-         * Online variance algorithm:
+                                                    irqs->ema_time[index]);
-         *
-         *  new_variance = variance + (value - average) x (value - new_average)
-         *
-         * Warning: irqs->avg is updated with the line above, hence
-         * 'interval - irqs->avg' is no longer equal to 'diff'
-         */
-        irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
-        /*
+        irqs->count++;
-         * Update the next event
-         */
-        irqs->next_evt = ts + irqs->avg;
 }
 /**
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
         */
        lockdep_assert_irqs_disabled();
+        if (!irqts->count)
+                return next_evt;
        /*
         * Number of elements in the circular buffer: If it happens it
         * was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
         * type but with the cost of extra computation in the
         * interrupt handler hot path. We choose efficiency.
         *
-         * Inject measured irq/timestamp to the statistical model
+         * Inject measured irq/timestamp to the pattern prediction
-         * while decrementing the counter because we consume the data
+         * model while decrementing the counter because we consume the
-         * from our circular buffer.
+         * data from our circular buffer.
         */
-        for (i = irqts->count & IRQ_TIMINGS_MASK,
-                     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-             irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
-                irq = irq_timing_decode(irqts->values[i], &ts);
+        i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
+        irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
+        for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+                irq = irq_timing_decode(irqts->values[i], &ts);
                s = idr_find(&irqt_stats, irq);
-                if (s) {
+                if (s)
-                        irqs = this_cpu_ptr(s);
+                        irq_timings_store(irq, this_cpu_ptr(s), ts);
-                        irqs_update(irqs, ts);
-                }
        }
        /*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
                irqs = this_cpu_ptr(s);
-                if (!irqs->valid)
+                ts = __irq_timings_next_event(irqs, i, now);
-                        continue;
+                if (ts <= now)
+                        return now;
-                if (irqs->next_evt <= now) {
+                if (ts < next_evt)
-                        irq = i;
+                        next_evt = ts;
-                        next_evt = now;
-                        /*
-                         * This interrupt mustn't use in the future
-                         * until new events occur and update the
-                         * statistics.
-                         */
-                        irqs->valid = 0;
-                        break;
-                }
-                if (irqs->next_evt < next_evt) {
-                        irq = i;
-                        next_evt = irqs->next_evt;
-                }
        }
        return next_evt;

diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
9	#include <linux/idr.h>	9	#include <linux/idr.h>
10	#include <linux/irq.h>	10	#include <linux/irq.h>
11	#include <linux/math64.h>	11	#include <linux/math64.h>
		12	#include <linux/log2.h>
12		13
13	#include <trace/events/irq.h>	14	#include <trace/events/irq.h>
14		15
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
18		19
19	DEFINE_PER_CPU(struct irq_timings, irq_timings);	20	DEFINE_PER_CPU(struct irq_timings, irq_timings);
20		21
21	struct irqt_stat {
22	u64 next_evt;
23	u64 last_ts;
24	u64 variance;
25	u32 avg;
26	u32 nr_samples;
27	int anomalies;
28	int valid;
29	};
30
31	static DEFINE_IDR(irqt_stats);	22	static DEFINE_IDR(irqt_stats);
32		23
33	void irq_timings_enable(void)	24	void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
40	static_branch_disable(&irq_timing_enabled);	31	static_branch_disable(&irq_timing_enabled);
41	}	32	}
42		33
43	/**	34	/*
44	* irqs_update - update the irq timing statistics with a new timestamp	35	* The main goal of this algorithm is to predict the next interrupt
		36	* occurrence on the current CPU.
		37	*
		38	* Currently, the interrupt timings are stored in a circular array
		39	* buffer every time there is an interrupt, as a tuple: the interrupt
		40	* number and the associated timestamp when the event occurred <irq,
		41	* timestamp>.
		42	*
		43	* For every interrupt occurring in a short period of time, we can
		44	* measure the elapsed time between the occurrences for the same
		45	* interrupt and we end up with a suite of intervals. The experience
		46	* showed the interrupts are often coming following a periodic
		47	* pattern.
		48	*
		49	* The objective of the algorithm is to find out this periodic pattern
		50	* in a fastest way and use its period to predict the next irq event.
		51	*
		52	* When the next interrupt event is requested, we are in the situation
		53	* where the interrupts are disabled and the circular buffer
		54	* containing the timings is filled with the events which happened
		55	* after the previous next-interrupt-event request.
		56	*
		57	* At this point, we read the circular buffer and we fill the irq
		58	* related statistics structure. After this step, the circular array
		59	* containing the timings is empty because all the values are
		60	* dispatched in their corresponding buffers.
		61	*
		62	* Now for each interrupt, we can predict the next event by using the
		63	* suffix array, log interval and exponential moving average
		64	*
		65	* 1. Suffix array
		66	*
		67	* Suffix array is an array of all the suffixes of a string. It is
		68	* widely used as a data structure for compression, text search, ...
		69	* For instance for the word 'banana', the suffixes will be: 'banana'
		70	* 'anana' 'nana' 'ana' 'na' 'a'
		71	*
		72	* Usually, the suffix array is sorted but for our purpose it is
		73	* not necessary and won't provide any improvement in the context of
		74	* the solved problem where we clearly define the boundaries of the
		75	* search by a max period and min period.
		76	*
		77	* The suffix array will build a suite of intervals of different
		78	* length and will look for the repetition of each suite. If the suite
		79	* is repeating then we have the period because it is the length of
		80	* the suite whatever its position in the buffer.
		81	*
		82	* 2. Log interval
		83	*
		84	* We saw the irq timings allow to compute the interval of the
		85	* occurrences for a specific interrupt. We can reasonibly assume the
		86	* longer is the interval, the higher is the error for the next event
		87	* and we can consider storing those interval values into an array
		88	* where each slot in the array correspond to an interval at the power
		89	* of 2 of the index. For example, index 12 will contain values
		90	* between 2^11 and 2^12.
		91	*
		92	* At the end we have an array of values where at each index defines a
		93	* [2^index - 1, 2 ^ index] interval values allowing to store a large
		94	* number of values inside a small array.
		95	*
		96	* For example, if we have the value 1123, then we store it at
		97	* ilog2(1123) = 10 index value.
		98	*
		99	* Storing those value at the specific index is done by computing an
		100	* exponential moving average for this specific slot. For instance,
		101	* for values 1800, 1123, 1453, ... fall under the same slot (10) and
		102	* the exponential moving average is computed every time a new value
		103	* is stored at this slot.
		104	*
		105	* 3. Exponential Moving Average
		106	*
		107	* The EMA is largely used to track a signal for stocks or as a low
		108	* pass filter. The magic of the formula, is it is very simple and the
		109	* reactivity of the average can be tuned with the factors called
		110	* alpha.
		111	*
		112	* The higher the alphas are, the faster the average respond to the
		113	* signal change. In our case, if a slot in the array is a big
		114	* interval, we can have numbers with a big difference between
		115	* them. The impact of those differences in the average computation
		116	* can be tuned by changing the alpha value.
		117	*
		118	*
		119	* -- The algorithm --
		120	*
		121	* We saw the different processing above, now let's see how they are
		122	* used together.
		123	*
		124	* For each interrupt:
		125	* For each interval:
		126	* Compute the index = ilog2(interval)
		127	* Compute a new_ema(buffer[index], interval)
		128	* Store the index in a circular buffer
		129	*
		130	* Compute the suffix array of the indexes
		131	*
		132	* For each suffix:
		133	* If the suffix is reverse-found 3 times
		134	* Return suffix
		135	*
		136	* Return Not found
		137	*
		138	* However we can not have endless suffix array to be build, it won't
		139	* make sense and it will add an extra overhead, so we can restrict
		140	* this to a maximum suffix length of 5 and a minimum suffix length of
		141	* 2. The experience showed 5 is the majority of the maximum pattern
		142	* period found for different devices.
		143	*
		144	* The result is a pattern finding less than 1us for an interrupt.
45	*	145	*
46	* @irqs: an irqt_stat struct pointer	146	* Example based on real values:
47	* @ts: the new timestamp
48	*	147	*
49	* The statistics are computed online, in other words, the code is	148	* Example 1 : MMC write/read interrupt interval:
50	* designed to compute the statistics on a stream of values rather
51	* than doing multiple passes on the values to compute the average,
52	* then the variance. The integer division introduces a loss of
53	* precision but with an acceptable error margin regarding the results
54	* we would have with the double floating precision: we are dealing
55	* with nanosec, so big numbers, consequently the mantisse is
56	* negligeable, especially when converting the time in usec
57	* afterwards.
58	*	149	*
59	* The computation happens at idle time. When the CPU is not idle, the	150	* 223947, 1240, 1384, 1386, 1386,
60	* interrupts' timestamps are stored in the circular buffer, when the	151	* 217416, 1236, 1384, 1386, 1387,
61	* CPU goes idle and this routine is called, all the buffer's values	152	* 214719, 1241, 1386, 1387, 1384,
62	* are injected in the statistical model continuying to extend the	153	* 213696, 1234, 1384, 1386, 1388,
63	* statistics from the previous busy-idle cycle.	154	* 219904, 1240, 1385, 1389, 1385,
		155	* 212240, 1240, 1386, 1386, 1386,
		156	* 214415, 1236, 1384, 1386, 1387,
		157	* 214276, 1234, 1384, 1388, ?
64	*	158	*
65	* The observations showed a device will trigger a burst of periodic	159	* For each element, apply ilog2(value)
66	* interrupts followed by one or two peaks of longer time, for
67	* instance when a SD card device flushes its cache, then the periodic
68	* intervals occur again. A one second inactivity period resets the
69	* stats, that gives us the certitude the statistical values won't
70	* exceed 1x10^9, thus the computation won't overflow.
71	*	160	*
72	* Basically, the purpose of the algorithm is to watch the periodic	161	* 15, 8, 8, 8, 8,
73	* interrupts and eliminate the peaks.	162	* 15, 8, 8, 8, 8,
		163	* 15, 8, 8, 8, 8,
		164	* 15, 8, 8, 8, 8,
		165	* 15, 8, 8, 8, 8,
		166	* 15, 8, 8, 8, 8,
		167	* 15, 8, 8, 8, 8,
		168	* 15, 8, 8, 8, ?
74	*	169	*
75	* An interrupt is considered periodically stable if the interval of	170	* Max period of 5, we take the last (max_period * 3) 15 elements as
76	* its occurences follow the normal distribution, thus the values	171	* we can be confident if the pattern repeats itself three times it is
77	* comply with:	172	* a repeating pattern.
78	*	173	*
79	* avg - 3 x stddev < value < avg + 3 x stddev	174	* 8,
		175	* 15, 8, 8, 8, 8,
		176	* 15, 8, 8, 8, 8,
		177	* 15, 8, 8, 8, ?
80	*	178	*
81	* Which can be simplified to:	179	* Suffixes are:
82	*	180	*
83	* -3 x stddev < value - avg < 3 x stddev	181	* 1) 8, 15, 8, 8, 8 <- max period
		182	* 2) 8, 15, 8, 8
		183	* 3) 8, 15, 8
		184	* 4) 8, 15 <- min period
84	*	185	*
85	* abs(value - avg) < 3 x stddev	186	* From there we search the repeating pattern for each suffix.
86	*	187	*
87	* In order to save a costly square root computation, we use the	188	* buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
88	* variance. For the record, stddev = sqrt(variance). The equation	189	* \| \| \| \| \| \| \| \| \| \| \| \| \| \| \|
89	* above becomes:	190	* 8, 15, 8, 8, 8 \| \| \| \| \| \| \| \| \| \|
		191	* 8, 15, 8, 8, 8 \| \| \| \| \|
		192	* 8, 15, 8, 8, 8
90	*	193	*
91	* abs(value - avg) < 3 x sqrt(variance)	194	* When moving the suffix, we found exactly 3 matches.
92	*	195	*
93	* And finally we square it:	196	* The first suffix with period 5 is repeating.
94	*	197	*
95	* (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2	198	* The next event is (3 * max_period) % suffix_period
96	*	199	*
97	* (value - avg) x (value - avg) < 9 x variance	200	* In this example, the result 0, so the next event is suffix[0] => 8
98	*	201	*
99	* Statistically speaking, any values out of this interval is	202	* However, 8 is the index in the array of exponential moving average
100	* considered as an anomaly and is discarded. However, a normal	203	* which was calculated on the fly when storing the values, so the
101	* distribution appears when the number of samples is 30 (it is the	204	* interval is ema[8] = 1366
102	* rule of thumb in statistics, cf. "30 samples" on Internet). When
103	* there are three consecutive anomalies, the statistics are resetted.
104	*	205	*
		206	*
		207	* Example 2:
		208	*
		209	* 4, 3, 5, 100,
		210	* 3, 3, 5, 117,
		211	* 4, 4, 5, 112,
		212	* 4, 3, 4, 110,
		213	* 3, 5, 3, 117,
		214	* 4, 4, 5, 112,
		215	* 4, 3, 4, 110,
		216	* 3, 4, 5, 112,
		217	* 4, 3, 4, 110
		218	*
		219	* ilog2
		220	*
		221	* 0, 0, 0, 4,
		222	* 0, 0, 0, 4,
		223	* 0, 0, 0, 4,
		224	* 0, 0, 0, 4,
		225	* 0, 0, 0, 4,
		226	* 0, 0, 0, 4,
		227	* 0, 0, 0, 4,
		228	* 0, 0, 0, 4,
		229	* 0, 0, 0, 4
		230	*
		231	* Max period 5:
		232	* 0, 0, 4,
		233	* 0, 0, 0, 4,
		234	* 0, 0, 0, 4,
		235	* 0, 0, 0, 4
		236	*
		237	* Suffixes:
		238	*
		239	* 1) 0, 0, 4, 0, 0
		240	* 2) 0, 0, 4, 0
		241	* 3) 0, 0, 4
		242	* 4) 0, 0
		243	*
		244	* buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
		245	* \| \| \| \| \| \| X
		246	* 0, 0, 4, 0, 0, \| X
		247	* 0, 0
		248	*
		249	* buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
		250	* \| \| \| \| \| \| \| \| \| \| \| \| \| \| \|
		251	* 0, 0, 4, 0, \| \| \| \| \| \| \| \| \| \| \|
		252	* 0, 0, 4, 0, \| \| \| \| \| \| \|
		253	* 0, 0, 4, 0, \| \| \|
		254	* 0 0 4
		255	*
		256	* Pattern is found 3 times, the remaining is 1 which results from
		257	* (max_period * 3) % suffix_period. This value is the index in the
		258	* suffix arrays. The suffix array for a period 4 has the value 4
		259	* at index 1.
		260	*/
		261	#define EMA_ALPHA_VAL 64
		262	#define EMA_ALPHA_SHIFT 7
		263
		264	#define PREDICTION_PERIOD_MIN 2
		265	#define PREDICTION_PERIOD_MAX 5
		266	#define PREDICTION_FACTOR 4
		267	#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
		268	#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
		269
		270	struct irqt_stat {
		271	u64 last_ts;
		272	u64 ema_time[PREDICTION_BUFFER_SIZE];
		273	int timings[IRQ_TIMINGS_SIZE];
		274	int circ_timings[IRQ_TIMINGS_SIZE];
		275	int count;
		276	};
		277
		278	/*
		279	* Exponential moving average computation
105	*/	280	*/
106	static void irqs_update(struct irqt_stat *irqs, u64 ts)	281	static u64 irq_timings_ema_new(u64 value, u64 ema_old)
		282	{
		283	s64 diff;
		284
		285	if (unlikely(!ema_old))
		286	return value;
		287
		288	diff = (value - ema_old) * EMA_ALPHA_VAL;
		289	/*
		290	* We can use a s64 type variable to be added with the u64
		291	* ema_old variable as this one will never have its topmost
		292	* bit set, it will be always smaller than 2^63 nanosec
		293	* interrupt interval (292 years).
		294	*/
		295	return ema_old + (diff >> EMA_ALPHA_SHIFT);
		296	}
		297
		298	static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
		299	{
		300	int i;
		301
		302	/*
		303	* The buffer contains the suite of intervals, in a ilog2
		304	* basis, we are looking for a repetition. We point the
		305	* beginning of the search three times the length of the
		306	* period beginning at the end of the buffer. We do that for
		307	* each suffix.
		308	*/
		309	for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
		310
		311	int begin = &buffer[len - (i 3)];
		312	int *ptr = begin;
		313
		314	/*
		315	* We look if the suite with period 'i' repeat
		316	* itself. If it is truncated at the end, as it
		317	* repeats we can use the period to find out the next
		318	* element.
		319	*/
		320	while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
		321	ptr += i;
		322	if (ptr >= &buffer[len])
		323	return begin[((i * 3) % i)];
		324	}
		325	}
		326
		327	return -1;
		328	}
		329
		330	static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
		331	{
		332	int index, i, period_max, count, start, min = INT_MAX;
		333
		334	if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
		335	irqs->count = irqs->last_ts = 0;
		336	return U64_MAX;
		337	}
		338
		339	/*
		340	* As we want to find three times the repetition, we need a
		341	* number of intervals greater or equal to three times the
		342	* maximum period, otherwise we truncate the max period.
		343	*/
		344	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
		345	PREDICTION_PERIOD_MAX : irqs->count / 3;
		346
		347	/*
		348	* If we don't have enough irq timings for this prediction,
		349	* just bail out.
		350	*/
		351	if (period_max <= PREDICTION_PERIOD_MIN)
		352	return U64_MAX;
		353
		354	/*
		355	* 'count' will depends if the circular buffer wrapped or not
		356	*/
		357	count = irqs->count < IRQ_TIMINGS_SIZE ?
		358	irqs->count : IRQ_TIMINGS_SIZE;
		359
		360	start = irqs->count < IRQ_TIMINGS_SIZE ?
		361	0 : (irqs->count & IRQ_TIMINGS_MASK);
		362
		363	/*
		364	* Copy the content of the circular buffer into another buffer
		365	* in order to linearize the buffer instead of dealing with
		366	* wrapping indexes and shifted array which will be prone to
		367	* error and extremelly difficult to debug.
		368	*/
		369	for (i = 0; i < count; i++) {
		370	int index = (start + i) & IRQ_TIMINGS_MASK;
		371
		372	irqs->timings[i] = irqs->circ_timings[index];
		373	min = min_t(int, irqs->timings[i], min);
		374	}
		375
		376	index = irq_timings_next_event_index(irqs->timings, count, period_max);
		377	if (index < 0)
		378	return irqs->last_ts + irqs->ema_time[min];
		379
		380	return irqs->last_ts + irqs->ema_time[index];
		381	}
		382
		383	static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
107	{	384	{
108	u64 old_ts = irqs->last_ts;	385	u64 old_ts = irqs->last_ts;
109	u64 variance = 0;
110	u64 interval;	386	u64 interval;
111	s64 diff;	387	int index;
112		388
113	/*	389	/*
114	* The timestamps are absolute time values, we need to compute	390	* The timestamps are absolute time values, we need to compute
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
135	* want as we need another timestamp to compute an interval.	411	* want as we need another timestamp to compute an interval.
136	*/	412	*/
137	if (interval >= NSEC_PER_SEC) {	413	if (interval >= NSEC_PER_SEC) {
138	memset(irqs, 0, sizeof(*irqs));	414	irqs->count = 0;
139	irqs->last_ts = ts;
140	return;	415	return;
141	}	416	}
142		417
143	/*	418	/*
144	* Pre-compute the delta with the average as the result is	419	* Get the index in the ema table for this interrupt. The
145	* used several times in this function.	420	* PREDICTION_FACTOR increase the interval size for the array
146	*/	421	* of exponential average.
147	diff = interval - irqs->avg;
148
149	/*
150	* Increment the number of samples.
151	*/
152	irqs->nr_samples++;
153
154	/*
155	* Online variance divided by the number of elements if there
156	* is more than one sample. Normally the formula is division
157	* by nr_samples - 1 but we assume the number of element will be
158	* more than 32 and dividing by 32 instead of 31 is enough
159	* precise.
160	*/
161	if (likely(irqs->nr_samples > 1))
162	variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
163
164	/*
165	* The rule of thumb in statistics for the normal distribution
166	* is having at least 30 samples in order to have the model to
167	* apply. Values outside the interval are considered as an
168	* anomaly.
169	*/
170	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
171	/*
172	* After three consecutive anomalies, we reset the
173	* stats as it is no longer stable enough.
174	*/
175	if (irqs->anomalies++ >= 3) {
176	memset(irqs, 0, sizeof(*irqs));
177	irqs->last_ts = ts;
178	return;
179	}
180	} else {
181	/*
182	* The anomalies must be consecutives, so at this
183	* point, we reset the anomalies counter.
184	*/
185	irqs->anomalies = 0;
186	}
187
188	/*
189	* The interrupt is considered stable enough to try to predict
190	* the next event on it.
191	*/	422	*/
192	irqs->valid = 1;	423	index = likely(interval) ?
		424	ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
193		425
194	/*	426	/*
195	* Online average algorithm:	427	* Store the index as an element of the pattern in another
196	*	428	* circular array.
197	* new_average = average + ((value - average) / count)
198	*
199	* The variance computation depends on the new average
200	* to be computed here first.
201	*
202	*/	429	*/
203	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);	430	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
204		431
205	/*	432	irqs->ema_time[index] = irq_timings_ema_new(interval,
206	* Online variance algorithm:	433	irqs->ema_time[index]);
207	*
208	* new_variance = variance + (value - average) x (value - new_average)
209	*
210	* Warning: irqs->avg is updated with the line above, hence
211	* 'interval - irqs->avg' is no longer equal to 'diff'
212	*/
213	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
214		434
215	/*	435	irqs->count++;
216	* Update the next event
217	*/
218	irqs->next_evt = ts + irqs->avg;
219	}	436	}
220		437
221	/**	438	/**
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
259	*/	476	*/
260	lockdep_assert_irqs_disabled();	477	lockdep_assert_irqs_disabled();
261		478
		479	if (!irqts->count)
		480	return next_evt;
		481
262	/*	482	/*
263	* Number of elements in the circular buffer: If it happens it	483	* Number of elements in the circular buffer: If it happens it
264	* was flushed before, then the number of elements could be	484	* was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
269	* type but with the cost of extra computation in the	489	* type but with the cost of extra computation in the
270	* interrupt handler hot path. We choose efficiency.	490	* interrupt handler hot path. We choose efficiency.
271	*	491	*
272	* Inject measured irq/timestamp to the statistical model	492	* Inject measured irq/timestamp to the pattern prediction
273	* while decrementing the counter because we consume the data	493	* model while decrementing the counter because we consume the
274	* from our circular buffer.	494	* data from our circular buffer.
275	*/	495	*/
276	for (i = irqts->count & IRQ_TIMINGS_MASK,
277	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
278	irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
279		496
280	irq = irq_timing_decode(irqts->values[i], &ts);	497	i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
		498	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
281		499
		500	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
		501	irq = irq_timing_decode(irqts->values[i], &ts);
282	s = idr_find(&irqt_stats, irq);	502	s = idr_find(&irqt_stats, irq);
283	if (s) {	503	if (s)
284	irqs = this_cpu_ptr(s);	504	irq_timings_store(irq, this_cpu_ptr(s), ts);
285	irqs_update(irqs, ts);
286	}
287	}	505	}
288		506
289	/*	507	/*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
294		512
295	irqs = this_cpu_ptr(s);	513	irqs = this_cpu_ptr(s);
296		514
297	if (!irqs->valid)	515	ts = __irq_timings_next_event(irqs, i, now);
298	continue;	516	if (ts <= now)
		517	return now;
299		518
300	if (irqs->next_evt <= now) {	519	if (ts < next_evt)
301	irq = i;	520	next_evt = ts;
302	next_evt = now;
303
304	/*
305	* This interrupt mustn't use in the future
306	* until new events occur and update the
307	* statistics.
308	*/
309	irqs->valid = 0;
310	break;
311	}
312
313	if (irqs->next_evt < next_evt) {
314	irq = i;
315	next_evt = irqs->next_evt;
316	}
317	}	521	}
318		522
319	return next_evt;	523	return next_evt;