-rw-r--r--  kernel/irq/devres.c  |   3
-rw-r--r--  kernel/irq/manage.c  |   4
-rw-r--r--  kernel/irq/timings.c | 522
-rw-r--r--  kernel/irq_work.c    |  75
4 files changed, 409 insertions, 195 deletions
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index f808c6a97dcc..f6e5515ee077 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
 			    irq_flow_handler_t handler)
 {
 	struct irq_chip_generic *gc;
-	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
 
-	gc = devm_kzalloc(dev, sz, GFP_KERNEL);
+	gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
 	if (gc)
 		irq_init_generic_chip(gc, name, num_ct,
 				      irq_base, reg_base, handler);
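
For readers unfamiliar with the helper: struct_size() comes from <linux/overflow.h> and is applicable here because struct irq_chip_generic ends in a flexible chip_types[] array. A rough sketch of the equivalence follows; it is not the macro's actual definition, only what it computes:

/*
 * Rough equivalence sketch (illustrative, not the real macro body):
 *
 *   struct_size(gc, chip_types, num_ct)
 *       ~= sizeof(*gc) + num_ct * sizeof(gc->chip_types[0])
 *       ~= sizeof(struct irq_chip_generic)
 *          + num_ct * sizeof(struct irq_chip_type)
 *
 * with the difference that struct_size() checks the multiplication and
 * addition for overflow and saturates to SIZE_MAX, so a bogus num_ct
 * makes devm_kzalloc() fail cleanly instead of under-allocating.
 */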
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1401afa0d58a..53a081392115 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	desc->affinity_notify = notify;
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
-	if (old_notify)
+	if (old_notify) {
+		cancel_work_sync(&old_notify->work);
 		kref_put(&old_notify->kref, old_notify->release);
+	}
 
 	return 0;
 }
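
For context, here is a minimal sketch of a hypothetical notifier release callback (not taken from this patch); it illustrates why the old work must be cancelled before the final kref_put(): the release typically frees the structure that embeds the still-queued work item.

/* Hypothetical example of a driver-side release callback */
static void my_affinity_notify_release(struct kref *kref)
{
	struct irq_affinity_notify *notify =
		container_of(kref, struct irq_affinity_notify, kref);

	kfree(notify);	/* also frees the embedded notify->work */
}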
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..90c735da15d0 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
 #include <linux/idr.h>
 #include <linux/irq.h>
 #include <linux/math64.h>
+#include <linux/log2.h>
 
 #include <trace/events/irq.h>
 
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
 
 DEFINE_PER_CPU(struct irq_timings, irq_timings);
 
-struct irqt_stat {
-	u64	next_evt;
-	u64	last_ts;
-	u64	variance;
-	u32	avg;
-	u32	nr_samples;
-	int	anomalies;
-	int	valid;
-};
-
 static DEFINE_IDR(irqt_stats);
 
 void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
 	static_branch_disable(&irq_timing_enabled);
 }
 
-/**
- * irqs_update - update the irq timing statistics with a new timestamp
+/*
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
+ *
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
+ *
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. Experience has
+ * shown that the interrupts often arrive following a periodic
+ * pattern.
+ *
+ * The objective of the algorithm is to find out this periodic pattern
+ * as quickly as possible and use its period to predict the next irq event.
+ *
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
+ *
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
+ *
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average.
+ *
+ * 1. Suffix array
+ *
+ * A suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance, for the word 'banana', the suffixes are: 'banana',
+ * 'anana', 'nana', 'ana', 'na', 'a'
+ *
+ * Usually, the suffix array is sorted, but for our purpose that is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem, where we clearly define the boundaries of the
+ * search by a max period and a min period.
+ *
+ * The suffix array will build a suite of intervals of different
+ * lengths and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period, because it is the length of
+ * the suite whatever its position in the buffer.
+ *
+ * 2. Log interval
+ *
+ * We saw that the irq timings allow us to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonably assume that
+ * the longer the interval, the higher the error for the next event,
+ * so we can consider storing those interval values into an array
+ * where each slot in the array corresponds to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^11 and 2^12.
+ *
+ * At the end we have an array of values where each index defines a
+ * [2^(index - 1), 2^index] interval of values, allowing us to store a
+ * large number of values inside a small array.
+ *
+ * For example, if we have the value 1123, then we store it at
+ * index value ilog2(1123) = 10.
+ *
+ * Storing those values at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * the values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
+ *
+ * 3. Exponential Moving Average
+ *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula is that it is very simple and
+ * the reactivity of the average can be tuned with the factor called
+ * alpha.
+ *
+ * The higher the alpha, the faster the average responds to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ * -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ *	For each interval:
+ *		Compute the index = ilog2(interval)
+ *		Compute a new_ema(buffer[index], interval)
+ *		Store the index in a circular buffer
+ *
+ *	Compute the suffix array of the indexes
+ *
+ *	For each suffix:
+ *		If the suffix is reverse-found 3 times
+ *			Return suffix
+ *
+ *	Return Not found
+ *
+ * However, we cannot build an endless suffix array; it would not make
+ * sense and would add extra overhead, so we restrict this to a
+ * maximum suffix length of 5 and a minimum suffix length of 2.
+ * Experience has shown that 5 covers the majority of the maximum
+ * pattern periods found for different devices.
+ *
+ * The result is a pattern search taking less than 1us per interrupt.
  *
- * @irqs: an irqt_stat struct pointer
- * @ts: the new timestamp
+ * Example based on real values:
  *
- * The statistics are computed online, in other words, the code is
- * designed to compute the statistics on a stream of values rather
- * than doing multiple passes on the values to compute the average,
- * then the variance. The integer division introduces a loss of
- * precision but with an acceptable error margin regarding the results
- * we would have with the double floating precision: we are dealing
- * with nanosec, so big numbers, consequently the mantisse is
- * negligeable, especially when converting the time in usec
- * afterwards.
+ * Example 1: MMC write/read interrupt interval:
  *
- * The computation happens at idle time. When the CPU is not idle, the
- * interrupts' timestamps are stored in the circular buffer, when the
- * CPU goes idle and this routine is called, all the buffer's values
- * are injected in the statistical model continuying to extend the
- * statistics from the previous busy-idle cycle.
+ *	223947, 1240, 1384, 1386, 1386,
+ *	217416, 1236, 1384, 1386, 1387,
+ *	214719, 1241, 1386, 1387, 1384,
+ *	213696, 1234, 1384, 1386, 1388,
+ *	219904, 1240, 1385, 1389, 1385,
+ *	212240, 1240, 1386, 1386, 1386,
+ *	214415, 1236, 1384, 1386, 1387,
+ *	214276, 1234, 1384, 1388, ?
  *
- * The observations showed a device will trigger a burst of periodic
- * interrupts followed by one or two peaks of longer time, for
- * instance when a SD card device flushes its cache, then the periodic
- * intervals occur again. A one second inactivity period resets the
- * stats, that gives us the certitude the statistical values won't
- * exceed 1x10^9, thus the computation won't overflow.
+ * For each element, apply ilog2(value)
  *
- * Basically, the purpose of the algorithm is to watch the periodic
- * interrupts and eliminate the peaks.
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, ?
  *
- * An interrupt is considered periodically stable if the interval of
- * its occurences follow the normal distribution, thus the values
- * comply with:
+ * With a max period of 5, we take the last (max_period * 3) = 15
+ * elements, as we can be confident that the pattern repeats if it
+ * shows up three times in a row.
  *
- * avg - 3 x stddev < value < avg + 3 x stddev
+ *	8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, ?
  *
- * Which can be simplified to:
+ * Suffixes are:
  *
- * -3 x stddev < value - avg < 3 x stddev
+ *  1) 8, 15, 8, 8, 8  <- max period
+ *  2) 8, 15, 8, 8
+ *  3) 8, 15, 8
+ *  4) 8, 15           <- min period
  *
- * abs(value - avg) < 3 x stddev
+ * From there we search the repeating pattern for each suffix.
  *
- * In order to save a costly square root computation, we use the
- * variance. For the record, stddev = sqrt(variance). The equation
- * above becomes:
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
+ *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
+ *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
+ *                         8, 15, 8, 8, 8  |   |  |  |  |
+ *                                         8, 15, 8, 8, 8
  *
- * abs(value - avg) < 3 x sqrt(variance)
+ * When moving the suffix along the buffer, we find exactly 3 matches.
  *
- * And finally we square it:
+ * The first suffix with period 5 is repeating.
  *
- * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ * The next event index is (3 * max_period) % suffix_period
  *
- * (value - avg) x (value - avg) < 9 x variance
+ * In this example, the result is 0, so the next event is suffix[0] => 8
  *
- * Statistically speaking, any values out of this interval is
- * considered as an anomaly and is discarded. However, a normal
- * distribution appears when the number of samples is 30 (it is the
- * rule of thumb in statistics, cf. "30 samples" on Internet). When
- * there are three consecutive anomalies, the statistics are resetted.
+ * However, 8 is an index into the array of exponential moving
+ * averages, which was computed on the fly when storing the values,
+ * so the interval is ema[8] = 1366
  *
+ *
+ * Example 2:
+ *
+ *	4, 3, 5, 100,
+ *	3, 3, 5, 117,
+ *	4, 4, 5, 112,
+ *	4, 3, 4, 110,
+ *	3, 5, 3, 117,
+ *	4, 4, 5, 112,
+ *	4, 3, 4, 110,
+ *	3, 4, 5, 112,
+ *	4, 3, 4, 110
+ *
+ * ilog2
+ *
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4
+ *
+ * Max period 5:
+ *	   0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ *  1) 0, 0, 4, 0, 0
+ *  2) 0, 0, 4, 0
+ *  3) 0, 0, 4
+ *  4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  X
+ *         0, 0, 4, 0, 0, |  X
+ *                        0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+ *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
+ *                     0, 0, 4, 0, |  |  |  |  |  |  |
+ *                                 0, 0, 4, 0, |  |  |
+ *                                             0  0  4
+ *
+ * The pattern is found 3 times; the remainder is 1, which results
+ * from (max_period * 3) % suffix_period. This value is the index
+ * into the suffix array. The suffix with period 4 has the value 4
+ * at index 1.
  */
-static void irqs_update(struct irqt_stat *irqs, u64 ts)
+#define EMA_ALPHA_VAL		64
+#define EMA_ALPHA_SHIFT		7
+
+#define PREDICTION_PERIOD_MIN	2
+#define PREDICTION_PERIOD_MAX	5
+#define PREDICTION_FACTOR	4
+#define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */
+
+struct irqt_stat {
+	u64	last_ts;
+	u64	ema_time[PREDICTION_BUFFER_SIZE];
+	int	timings[IRQ_TIMINGS_SIZE];
+	int	circ_timings[IRQ_TIMINGS_SIZE];
+	int	count;
+};
+
+/*
+ * Exponential moving average computation
+ */
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
+{
+	s64 diff;
+
+	if (unlikely(!ema_old))
+		return value;
+
+	diff = (value - ema_old) * EMA_ALPHA_VAL;
+	/*
+	 * We can add the s64 variable to the u64 ema_old variable as
+	 * the latter will never have its topmost bit set: it is
+	 * always smaller than a 2^63 nanosec interrupt interval
+	 * (292 years).
+	 */
+	return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+	int i;
+
+	/*
+	 * The buffer contains the suite of intervals on an ilog2
+	 * basis, and we are looking for a repetition. We point the
+	 * beginning of the search at three times the length of the
+	 * period, counting from the end of the buffer. We do that for
+	 * each suffix.
+	 */
+	for (i = period_max; i >= PREDICTION_PERIOD_MIN; i--) {
+
+		int *begin = &buffer[len - (i * 3)];
+		int *ptr = begin;
+
+		/*
+		 * We check whether the suite with period 'i' repeats
+		 * itself. If it is truncated at the end, since it
+		 * repeats, we can use the period to find out the next
+		 * element.
+		 */
+		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
+			ptr += i;
+			if (ptr >= &buffer[len])
+				return begin[((i * 3) % i)];
+		}
+	}
+
+	return -1;
+}
+
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+	int index, i, period_max, count, start, min = INT_MAX;
+
+	if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+		irqs->count = irqs->last_ts = 0;
+		return U64_MAX;
+	}
+
+	/*
+	 * As we want to find three repetitions, we need a number of
+	 * intervals greater than or equal to three times the maximum
+	 * period, otherwise we truncate the max period.
+	 */
+	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+		PREDICTION_PERIOD_MAX : irqs->count / 3;
+
+	/*
+	 * If we don't have enough irq timings for this prediction,
+	 * just bail out.
+	 */
+	if (period_max <= PREDICTION_PERIOD_MIN)
+		return U64_MAX;
+
+	/*
+	 * 'count' depends on whether the circular buffer wrapped or not
+	 */
+	count = irqs->count < IRQ_TIMINGS_SIZE ?
+		irqs->count : IRQ_TIMINGS_SIZE;
+
+	start = irqs->count < IRQ_TIMINGS_SIZE ?
+		0 : (irqs->count & IRQ_TIMINGS_MASK);
+
+	/*
+	 * Copy the content of the circular buffer into another buffer
+	 * in order to linearize it, instead of dealing with wrapping
+	 * indexes and a shifted array, which would be error prone and
+	 * extremely difficult to debug.
+	 */
+	for (i = 0; i < count; i++) {
+		int index = (start + i) & IRQ_TIMINGS_MASK;
+
+		irqs->timings[i] = irqs->circ_timings[index];
+		min = min_t(int, irqs->timings[i], min);
+	}
+
+	index = irq_timings_next_event_index(irqs->timings, count, period_max);
+	if (index < 0)
+		return irqs->last_ts + irqs->ema_time[min];
+
+	return irqs->last_ts + irqs->ema_time[index];
+}
+
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
 {
 	u64 old_ts = irqs->last_ts;
-	u64 variance = 0;
 	u64 interval;
-	s64 diff;
+	int index;
 
 	/*
 	 * The timestamps are absolute time values, we need to compute
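
To make the suffix-repetition step concrete, here is a small user-space sketch (not kernel code; the helper name and the main() harness are made up for illustration) that mirrors the search loop of irq_timings_next_event_index() above and runs it on the ilog2 sequence from Example 1:

#include <stdio.h>
#include <string.h>

#define PREDICTION_PERIOD_MIN	2

static int next_event_index(const int *buffer, size_t len, int period_max)
{
	int i;

	/* Try the longest period first, exactly like the kernel loop */
	for (i = period_max; i >= PREDICTION_PERIOD_MIN; i--) {
		const int *begin = &buffer[len - (i * 3)];
		const int *ptr = begin;

		/* Walk forward while the first 'i' elements keep repeating */
		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buffer[len])
				return begin[(i * 3) % i];
		}
	}

	return -1;
}

int main(void)
{
	/* ilog2 indexes from Example 1: a period-5 pattern repeated 3 times */
	int buf[] = { 8, 15, 8, 8, 8,  8, 15, 8, 8, 8,  8, 15, 8, 8, 8 };

	/* Prints 8: the EMA slot holding the ~1.4ms interval average */
	printf("next index: %d\n",
	       next_event_index(buf, sizeof(buf) / sizeof(buf[0]), 5));

	return 0;
}

The comparison window always starts (period * 3) elements from the end of the buffer, so a suffix only counts as a pattern when it is seen three times in a row, matching the "3 matches" requirement in the comment above.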
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
 	 * want as we need another timestamp to compute an interval.
 	 */
 	if (interval >= NSEC_PER_SEC) {
-		memset(irqs, 0, sizeof(*irqs));
-		irqs->last_ts = ts;
+		irqs->count = 0;
 		return;
 	}
 
 	/*
-	 * Pre-compute the delta with the average as the result is
-	 * used several times in this function.
-	 */
-	diff = interval - irqs->avg;
-
-	/*
-	 * Increment the number of samples.
-	 */
-	irqs->nr_samples++;
-
-	/*
-	 * Online variance divided by the number of elements if there
-	 * is more than one sample. Normally the formula is division
-	 * by nr_samples - 1 but we assume the number of element will be
-	 * more than 32 and dividing by 32 instead of 31 is enough
-	 * precise.
-	 */
-	if (likely(irqs->nr_samples > 1))
-		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
-
-	/*
-	 * The rule of thumb in statistics for the normal distribution
-	 * is having at least 30 samples in order to have the model to
-	 * apply. Values outside the interval are considered as an
-	 * anomaly.
-	 */
-	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
-		/*
-		 * After three consecutive anomalies, we reset the
-		 * stats as it is no longer stable enough.
-		 */
-		if (irqs->anomalies++ >= 3) {
-			memset(irqs, 0, sizeof(*irqs));
-			irqs->last_ts = ts;
-			return;
-		}
-	} else {
-		/*
-		 * The anomalies must be consecutives, so at this
-		 * point, we reset the anomalies counter.
-		 */
-		irqs->anomalies = 0;
-	}
-
-	/*
-	 * The interrupt is considered stable enough to try to predict
-	 * the next event on it.
+	 * Get the index in the ema table for this interrupt. The
+	 * PREDICTION_FACTOR increases the interval size for the array
+	 * of exponential averages.
 	 */
-	irqs->valid = 1;
+	index = likely(interval) ?
+		ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
 
 	/*
-	 * Online average algorithm:
-	 *
-	 * new_average = average + ((value - average) / count)
-	 *
-	 * The variance computation depends on the new average
-	 * to be computed here first.
-	 *
+	 * Store the index as an element of the pattern in another
+	 * circular array.
 	 */
-	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
 
-	/*
-	 * Online variance algorithm:
-	 *
-	 * new_variance = variance + (value - average) x (value - new_average)
-	 *
-	 * Warning: irqs->avg is updated with the line above, hence
-	 * 'interval - irqs->avg' is no longer equal to 'diff'
-	 */
-	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
+	irqs->ema_time[index] = irq_timings_ema_new(interval,
+						    irqs->ema_time[index]);
 
-	/*
-	 * Update the next event
-	 */
-	irqs->next_evt = ts + irqs->avg;
+	irqs->count++;
 }
 
 /**
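
And a companion user-space sketch (again illustrative only, with a made-up interval value) of the two per-interval steps in irq_timings_store() above: computing the EMA slot index and updating that slot's exponential moving average. The indexes listed in Example 1 appear to correspond to this computation, i.e. ilog2 of the interval scaled down by the ns-to-us shift and PREDICTION_FACTOR:

#include <stdint.h>
#include <stdio.h>

#define EMA_ALPHA_VAL		64
#define EMA_ALPHA_SHIFT		7
#define PREDICTION_FACTOR	4

/* Plain integer log2 standing in for the kernel's ilog2() */
static int ilog2_u64(uint64_t v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

/* Same arithmetic as irq_timings_ema_new(); relies on an arithmetic
 * right shift of negative values, as the kernel code does. */
static uint64_t ema_new(uint64_t value, uint64_t ema_old)
{
	int64_t diff;

	if (!ema_old)
		return value;

	diff = (int64_t)(value - ema_old) * EMA_ALPHA_VAL;
	return ema_old + (diff >> EMA_ALPHA_SHIFT);
}

int main(void)
{
	uint64_t interval = 1386000;	/* ~1386us interval, in nanoseconds */
	uint64_t ema = 0;
	int index;

	/* ns -> us (>> 10), then widen the buckets by PREDICTION_FACTOR */
	index = interval ? ilog2_u64((interval >> 10) / PREDICTION_FACTOR) : 0;

	ema = ema_new(interval, ema);	/* first sample: EMA == value */
	ema = ema_new(1384000, ema);	/* later samples move it slowly */

	/* Prints: index 8, ema 1385000 */
	printf("index %d, ema %llu\n", index, (unsigned long long)ema);

	return 0;
}

Intervals around 214-224 ms land on index 15 by the same computation, which is how a sequence like Example 1's 15, 8, 8, 8, 8 pattern arises.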
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
 	 */
 	lockdep_assert_irqs_disabled();
 
+	if (!irqts->count)
+		return next_evt;
+
 	/*
 	 * Number of elements in the circular buffer: If it happens it
 	 * was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
 	 * type but with the cost of extra computation in the
 	 * interrupt handler hot path. We choose efficiency.
 	 *
-	 * Inject measured irq/timestamp to the statistical model
-	 * while decrementing the counter because we consume the data
-	 * from our circular buffer.
+	 * Inject measured irq/timestamp to the pattern prediction
+	 * model while decrementing the counter because we consume the
+	 * data from our circular buffer.
 	 */
-	for (i = irqts->count & IRQ_TIMINGS_MASK,
-	     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
 
-		irq = irq_timing_decode(irqts->values[i], &ts);
+	i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
+	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
 
+	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+		irq = irq_timing_decode(irqts->values[i], &ts);
 		s = idr_find(&irqt_stats, irq);
-		if (s) {
-			irqs = this_cpu_ptr(s);
-			irqs_update(irqs, ts);
-		}
+		if (s)
+			irq_timings_store(irq, this_cpu_ptr(s), ts);
 	}
 
 	/*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
 
 		irqs = this_cpu_ptr(s);
 
-		if (!irqs->valid)
-			continue;
+		ts = __irq_timings_next_event(irqs, i, now);
+		if (ts <= now)
+			return now;
 
-		if (irqs->next_evt <= now) {
-			irq = i;
-			next_evt = now;
-
-			/*
-			 * This interrupt mustn't use in the future
-			 * until new events occur and update the
-			 * statistics.
-			 */
-			irqs->valid = 0;
-			break;
-		}
-
-		if (irqs->next_evt < next_evt) {
-			irq = i;
-			next_evt = irqs->next_evt;
-		}
+		if (ts < next_evt)
+			next_evt = ts;
 	}
 
 	return next_evt;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6b7cdf17ccf8..73288914ed5e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void)
  */
 }
 
-/*
- * Enqueue the irq_work @work on @cpu unless it's already pending
- * somewhere.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue_on(struct irq_work *work, int cpu)
+/* Enqueue on current CPU, work must already be claimed and preempt disabled */
+static void __irq_work_queue_local(struct irq_work *work)
 {
-	/* All work should have been flushed before going offline */
-	WARN_ON_ONCE(cpu_is_offline(cpu));
-
-#ifdef CONFIG_SMP
-
-	/* Arch remote IPI send/receive backend aren't NMI safe */
-	WARN_ON_ONCE(in_nmi());
+	/* If the work is "lazy", handle it from next tick if any */
+	if (work->flags & IRQ_WORK_LAZY) {
+		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+		    tick_nohz_tick_stopped())
+			arch_irq_work_raise();
+	} else {
+		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+			arch_irq_work_raise();
+	}
+}
 
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
 		return false;
 
-	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
-		arch_send_call_function_single_ipi(cpu);
-
-#else /* #ifdef CONFIG_SMP */
-	irq_work_queue(work);
-#endif /* #else #ifdef CONFIG_SMP */
+	/* Queue the entry and raise the IPI if needed. */
+	preempt_disable();
+	__irq_work_queue_local(work);
+	preempt_enable();
 
 	return true;
 }
+EXPORT_SYMBOL_GPL(irq_work_queue);
 
-/* Enqueue the irq work @work on the current CPU */
-bool irq_work_queue(struct irq_work *work)
+/*
+ * Enqueue the irq_work @work on @cpu unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
 {
+#ifndef CONFIG_SMP
+	return irq_work_queue(work);
+
+#else /* CONFIG_SMP: */
+	/* All work should have been flushed before going offline */
+	WARN_ON_ONCE(cpu_is_offline(cpu));
+
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
 		return false;
 
-	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
-
-	/* If the work is "lazy", handle it from next tick if any */
-	if (work->flags & IRQ_WORK_LAZY) {
-		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
-		    tick_nohz_tick_stopped())
-			arch_irq_work_raise();
+	if (cpu != smp_processor_id()) {
+		/* Arch remote IPI send/receive backends aren't NMI safe */
+		WARN_ON_ONCE(in_nmi());
+		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+			arch_send_call_function_single_ipi(cpu);
 	} else {
-		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
-			arch_irq_work_raise();
+		__irq_work_queue_local(work);
 	}
-
 	preempt_enable();
 
 	return true;
+#endif /* CONFIG_SMP */
 }
-EXPORT_SYMBOL_GPL(irq_work_queue);
+
 
 bool irq_work_needs_cpu(void)
 {
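
Finally, a short sketch of a hypothetical irq_work user, just to show which of the refactored paths each call takes (function and variable names are made up; the init_irq_work()/irq_work_queue()/irq_work_queue_on() calls are the existing API):

#include <linux/irq_work.h>
#include <linux/smp.h>
#include <linux/printk.h>

static struct irq_work my_work;

/* Runs in hard interrupt context, from the self-IPI or the next tick */
static void my_work_fn(struct irq_work *work)
{
	pr_info("irq_work ran on CPU%d\n", smp_processor_id());
}

static void my_init_and_kick(void)
{
	init_irq_work(&my_work, my_work_fn);

	/*
	 * Local queueing: goes through __irq_work_queue_local(), i.e.
	 * raised_list (or lazy_list for IRQ_WORK_LAZY work) plus a
	 * self-IPI only when actually needed.
	 */
	irq_work_queue(&my_work);

	/*
	 * Cross-CPU queueing (returns false here if my_work is still
	 * pending from the call above): only a genuinely remote target
	 * gets arch_send_call_function_single_ipi(); after this patch
	 * the local-CPU case falls back to the local path instead of
	 * sending an IPI to itself.
	 */
	irq_work_queue_on(&my_work, 1);
}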