-rw-r--r--  kernel/irq/devres.c  |   3
-rw-r--r--  kernel/irq/manage.c  |   4
-rw-r--r--  kernel/irq/timings.c | 522
-rw-r--r--  kernel/irq_work.c    |  75
4 files changed, 409 insertions, 195 deletions
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index f808c6a97dcc..f6e5515ee077 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
 			    irq_flow_handler_t handler)
 {
 	struct irq_chip_generic *gc;
-	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
 
-	gc = devm_kzalloc(dev, sz, GFP_KERNEL);
+	gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
 	if (gc)
 		irq_init_generic_chip(gc, name, num_ct,
 				      irq_base, reg_base, handler);
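
For readers unfamiliar with the helper: struct_size() comes from <linux/overflow.h> and is applicable here because struct irq_chip_generic ends in a flexible chip_types[] array. A rough sketch of the equivalence follows; it is not the macro's actual definition, only what it computes:

/*
 * Rough equivalence sketch (illustrative, not the real macro body):
 *
 *   struct_size(gc, chip_types, num_ct)
 *       ~= sizeof(*gc) + num_ct * sizeof(gc->chip_types[0])
 *       ~= sizeof(struct irq_chip_generic)
 *          + num_ct * sizeof(struct irq_chip_type)
 *
 * with the difference that struct_size() checks the multiplication and
 * addition for overflow and saturates to SIZE_MAX, so a bogus num_ct
 * makes devm_kzalloc() fail cleanly instead of under-allocating.
 */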
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1401afa0d58a..53a081392115 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	desc->affinity_notify = notify;
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
-	if (old_notify)
+	if (old_notify) {
+		cancel_work_sync(&old_notify->work);
 		kref_put(&old_notify->kref, old_notify->release);
+	}
 
 	return 0;
 }
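
For context, here is a minimal sketch of a hypothetical notifier release callback (not taken from this patch); it illustrates why the old work must be cancelled before the final kref_put(): the release typically frees the structure that embeds the still-queued work item.

/* Hypothetical example of a driver-side release callback */
static void my_affinity_notify_release(struct kref *kref)
{
	struct irq_affinity_notify *notify =
		container_of(kref, struct irq_affinity_notify, kref);

	kfree(notify);	/* also frees the embedded notify->work */
}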
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..90c735da15d0 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
 #include <linux/idr.h>
 #include <linux/irq.h>
 #include <linux/math64.h>
+#include <linux/log2.h>
 
 #include <trace/events/irq.h>
 
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
 
 DEFINE_PER_CPU(struct irq_timings, irq_timings);
 
-struct irqt_stat {
-	u64	next_evt;
-	u64	last_ts;
-	u64	variance;
-	u32	avg;
-	u32	nr_samples;
-	int	anomalies;
-	int	valid;
-};
-
 static DEFINE_IDR(irqt_stats);
 
 void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
 	static_branch_disable(&irq_timing_enabled);
 }
 
-/**
- * irqs_update - update the irq timing statistics with a new timestamp
+/*
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
+ *
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
+ *
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. Experience has
+ * shown that the interrupts often arrive following a periodic
+ * pattern.
+ *
+ * The objective of the algorithm is to find out this periodic pattern
+ * as quickly as possible and use its period to predict the next irq event.
+ *
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
+ *
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
+ *
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average.
+ *
+ * 1. Suffix array
+ *
+ * A suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance, for the word 'banana', the suffixes are: 'banana',
+ * 'anana', 'nana', 'ana', 'na', 'a'
+ *
+ * Usually, the suffix array is sorted, but for our purpose that is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem, where we clearly define the boundaries of the
+ * search by a max period and a min period.
+ *
+ * The suffix array will build a suite of intervals of different
+ * lengths and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period, because it is the length of
+ * the suite whatever its position in the buffer.
+ *
+ * 2. Log interval
+ *
+ * We saw that the irq timings allow us to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonably assume that
+ * the longer the interval, the higher the error for the next event,
+ * so we can consider storing those interval values into an array
+ * where each slot in the array corresponds to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^11 and 2^12.
+ *
+ * At the end we have an array of values where each index defines a
+ * [2^(index - 1), 2^index] interval of values, allowing us to store a
+ * large number of values inside a small array.
+ *
+ * For example, if we have the value 1123, then we store it at
+ * index value ilog2(1123) = 10.
+ *
+ * Storing those values at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * the values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
+ *
+ * 3. Exponential Moving Average
+ *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula is that it is very simple and
+ * the reactivity of the average can be tuned with the factor called
+ * alpha.
+ *
+ * The higher the alpha, the faster the average responds to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ * -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ *	For each interval:
+ *		Compute the index = ilog2(interval)
+ *		Compute a new_ema(buffer[index], interval)
+ *		Store the index in a circular buffer
+ *
+ *	Compute the suffix array of the indexes
+ *
+ *	For each suffix:
+ *		If the suffix is reverse-found 3 times
+ *			Return suffix
+ *
+ *	Return Not found
+ *
+ * However, we cannot build an endless suffix array; it would not make
+ * sense and would add extra overhead, so we restrict this to a
+ * maximum suffix length of 5 and a minimum suffix length of 2.
+ * Experience has shown that 5 covers the majority of the maximum
+ * pattern periods found for different devices.
+ *
+ * The result is a pattern search taking less than 1us per interrupt.
  *
- * @irqs: an irqt_stat struct pointer
- * @ts: the new timestamp
+ * Example based on real values:
  *
- * The statistics are computed online, in other words, the code is
- * designed to compute the statistics on a stream of values rather
- * than doing multiple passes on the values to compute the average,
- * then the variance. The integer division introduces a loss of
- * precision but with an acceptable error margin regarding the results
- * we would have with the double floating precision: we are dealing
- * with nanosec, so big numbers, consequently the mantisse is
- * negligeable, especially when converting the time in usec
- * afterwards.
+ * Example 1: MMC write/read interrupt interval:
  *
- * The computation happens at idle time. When the CPU is not idle, the
- * interrupts' timestamps are stored in the circular buffer, when the
- * CPU goes idle and this routine is called, all the buffer's values
- * are injected in the statistical model continuying to extend the
- * statistics from the previous busy-idle cycle.
+ *	223947, 1240, 1384, 1386, 1386,
+ *	217416, 1236, 1384, 1386, 1387,
+ *	214719, 1241, 1386, 1387, 1384,
+ *	213696, 1234, 1384, 1386, 1388,
+ *	219904, 1240, 1385, 1389, 1385,
+ *	212240, 1240, 1386, 1386, 1386,
+ *	214415, 1236, 1384, 1386, 1387,
+ *	214276, 1234, 1384, 1388, ?
  *
- * The observations showed a device will trigger a burst of periodic
- * interrupts followed by one or two peaks of longer time, for
- * instance when a SD card device flushes its cache, then the periodic
- * intervals occur again. A one second inactivity period resets the
- * stats, that gives us the certitude the statistical values won't
- * exceed 1x10^9, thus the computation won't overflow.
+ * For each element, apply ilog2(value)
  *
- * Basically, the purpose of the algorithm is to watch the periodic
- * interrupts and eliminate the peaks.
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, ?
  *
- * An interrupt is considered periodically stable if the interval of
- * its occurences follow the normal distribution, thus the values
- * comply with:
+ * With a max period of 5, we take the last (max_period * 3) = 15
+ * elements, as we can be confident that the pattern repeats if it
+ * shows up three times in a row.
  *
- * avg - 3 x stddev < value < avg + 3 x stddev
+ *	8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, ?
  *
- * Which can be simplified to:
+ * Suffixes are:
  *
- * -3 x stddev < value - avg < 3 x stddev
+ *  1) 8, 15, 8, 8, 8  <- max period
+ *  2) 8, 15, 8, 8
+ *  3) 8, 15, 8
+ *  4) 8, 15           <- min period
  *
- * abs(value - avg) < 3 x stddev
+ * From there we search the repeating pattern for each suffix.
  *
- * In order to save a costly square root computation, we use the
- * variance. For the record, stddev = sqrt(variance). The equation
- * above becomes:
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
+ *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
+ *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
+ *                         8, 15, 8, 8, 8  |   |  |  |  |
+ *                                         8, 15, 8, 8, 8
  *
- * abs(value - avg) < 3 x sqrt(variance)
+ * When moving the suffix along the buffer, we find exactly 3 matches.
  *
- * And finally we square it:
+ * The first suffix with period 5 is repeating.
  *
- * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ * The next event index is (3 * max_period) % suffix_period
  *
- * (value - avg) x (value - avg) < 9 x variance
+ * In this example, the result is 0, so the next event is suffix[0] => 8
  *
- * Statistically speaking, any values out of this interval is
- * considered as an anomaly and is discarded. However, a normal
- * distribution appears when the number of samples is 30 (it is the
- * rule of thumb in statistics, cf. "30 samples" on Internet). When
- * there are three consecutive anomalies, the statistics are resetted.
+ * However, 8 is an index into the array of exponential moving
+ * averages, which was computed on the fly when storing the values,
+ * so the interval is ema[8] = 1366
  *
+ *
+ * Example 2:
+ *
+ *	4, 3, 5, 100,
+ *	3, 3, 5, 117,
+ *	4, 4, 5, 112,
+ *	4, 3, 4, 110,
+ *	3, 5, 3, 117,
+ *	4, 4, 5, 112,
+ *	4, 3, 4, 110,
+ *	3, 4, 5, 112,
+ *	4, 3, 4, 110
+ *
+ * ilog2
+ *
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4
+ *
+ * Max period 5:
+ *	   0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ *  1) 0, 0, 4, 0, 0
+ *  2) 0, 0, 4, 0
+ *  3) 0, 0, 4
+ *  4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  X
+ *         0, 0, 4, 0, 0, |  X
+ *                        0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+ *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
+ *                     0, 0, 4, 0, |  |  |  |  |  |  |
+ *                                 0, 0, 4, 0, |  |  |
+ *                                             0  0  4
+ *
+ * The pattern is found 3 times; the remainder is 1, which results
+ * from (max_period * 3) % suffix_period. This value is the index
+ * into the suffix array. The suffix with period 4 has the value 4
+ * at index 1.
  */
-static void irqs_update(struct irqt_stat *irqs, u64 ts)
+#define EMA_ALPHA_VAL		64
+#define EMA_ALPHA_SHIFT		7
+
+#define PREDICTION_PERIOD_MIN	2
+#define PREDICTION_PERIOD_MAX	5
+#define PREDICTION_FACTOR	4
+#define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */
+
+struct irqt_stat {
+	u64	last_ts;
+	u64	ema_time[PREDICTION_BUFFER_SIZE];
+	int	timings[IRQ_TIMINGS_SIZE];
+	int	circ_timings[IRQ_TIMINGS_SIZE];
+	int	count;
+};
+
+/*
+ * Exponential moving average computation
+ */
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
+{
+	s64 diff;
+
+	if (unlikely(!ema_old))
+		return value;
+
+	diff = (value - ema_old) * EMA_ALPHA_VAL;
+	/*
+	 * We can add the s64 variable to the u64 ema_old variable as
+	 * the latter will never have its topmost bit set: it is
+	 * always smaller than a 2^63 nanosec interrupt interval
+	 * (292 years).
+	 */
+	return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+	int i;
+
+	/*
+	 * The buffer contains the suite of intervals on an ilog2
+	 * basis, and we are looking for a repetition. We point the
+	 * beginning of the search at three times the length of the
+	 * period, counting from the end of the buffer. We do that for
+	 * each suffix.
+	 */
+	for (i = period_max; i >= PREDICTION_PERIOD_MIN; i--) {
+
+		int *begin = &buffer[len - (i * 3)];
+		int *ptr = begin;
+
+		/*
+		 * We check whether the suite with period 'i' repeats
+		 * itself. If it is truncated at the end, since it
+		 * repeats, we can use the period to find out the next
+		 * element.
+		 */
+		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
+			ptr += i;
+			if (ptr >= &buffer[len])
+				return begin[((i * 3) % i)];
+		}
+	}
+
+	return -1;
+}
+
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+	int index, i, period_max, count, start, min = INT_MAX;
+
+	if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+		irqs->count = irqs->last_ts = 0;
+		return U64_MAX;
+	}
+
+	/*
+	 * As we want to find three repetitions, we need a number of
+	 * intervals greater than or equal to three times the maximum
+	 * period, otherwise we truncate the max period.
+	 */
+	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+		PREDICTION_PERIOD_MAX : irqs->count / 3;
+
+	/*
+	 * If we don't have enough irq timings for this prediction,
+	 * just bail out.
+	 */
+	if (period_max <= PREDICTION_PERIOD_MIN)
+		return U64_MAX;
+
+	/*
+	 * 'count' depends on whether the circular buffer wrapped or not
+	 */
+	count = irqs->count < IRQ_TIMINGS_SIZE ?
+		irqs->count : IRQ_TIMINGS_SIZE;
+
+	start = irqs->count < IRQ_TIMINGS_SIZE ?
+		0 : (irqs->count & IRQ_TIMINGS_MASK);
+
+	/*
+	 * Copy the content of the circular buffer into another buffer
+	 * in order to linearize it, instead of dealing with wrapping
+	 * indexes and a shifted array, which would be error prone and
+	 * extremely difficult to debug.
+	 */
+	for (i = 0; i < count; i++) {
+		int index = (start + i) & IRQ_TIMINGS_MASK;
+
+		irqs->timings[i] = irqs->circ_timings[index];
+		min = min_t(int, irqs->timings[i], min);
+	}
+
+	index = irq_timings_next_event_index(irqs->timings, count, period_max);
+	if (index < 0)
+		return irqs->last_ts + irqs->ema_time[min];
+
+	return irqs->last_ts + irqs->ema_time[index];
+}
+
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
 {
 	u64 old_ts = irqs->last_ts;
-	u64 variance = 0;
 	u64 interval;
-	s64 diff;
+	int index;
 
 	/*
 	 * The timestamps are absolute time values, we need to compute
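
To make the suffix-repetition step concrete, here is a small user-space sketch (not kernel code; the helper name and the main() harness are made up for illustration) that mirrors the search loop of irq_timings_next_event_index() above and runs it on the ilog2 sequence from Example 1:

#include <stdio.h>
#include <string.h>

#define PREDICTION_PERIOD_MIN	2

static int next_event_index(const int *buffer, size_t len, int period_max)
{
	int i;

	/* Try the longest period first, exactly like the kernel loop */
	for (i = period_max; i >= PREDICTION_PERIOD_MIN; i--) {
		const int *begin = &buffer[len - (i * 3)];
		const int *ptr = begin;

		/* Walk forward while the first 'i' elements keep repeating */
		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buffer[len])
				return begin[(i * 3) % i];
		}
	}

	return -1;
}

int main(void)
{
	/* ilog2 indexes from Example 1: a period-5 pattern repeated 3 times */
	int buf[] = { 8, 15, 8, 8, 8,  8, 15, 8, 8, 8,  8, 15, 8, 8, 8 };

	/* Prints 8: the EMA slot holding the ~1.4ms interval average */
	printf("next index: %d\n",
	       next_event_index(buf, sizeof(buf) / sizeof(buf[0]), 5));

	return 0;
}

The comparison window always starts (period * 3) elements from the end of the buffer, so a suffix only counts as a pattern when it is seen three times in a row, matching the "3 matches" requirement in the comment above.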
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
 	 * want as we need another timestamp to compute an interval.
 	 */
 	if (interval >= NSEC_PER_SEC) {
-		memset(irqs, 0, sizeof(*irqs));
-		irqs->last_ts = ts;
+		irqs->count = 0;
 		return;
 	}
 
 	/*
-	 * Pre-compute the delta with the average as the result is
-	 * used several times in this function.
-	 */
-	diff = interval - irqs->avg;
-
-	/*
-	 * Increment the number of samples.
-	 */
-	irqs->nr_samples++;
-
-	/*
-	 * Online variance divided by the number of elements if there
-	 * is more than one sample. Normally the formula is division
-	 * by nr_samples - 1 but we assume the number of element will be
-	 * more than 32 and dividing by 32 instead of 31 is enough
-	 * precise.
-	 */
-	if (likely(irqs->nr_samples > 1))
-		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
-
-	/*
-	 * The rule of thumb in statistics for the normal distribution
-	 * is having at least 30 samples in order to have the model to
-	 * apply. Values outside the interval are considered as an
-	 * anomaly.
-	 */
-	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
-		/*
-		 * After three consecutive anomalies, we reset the
-		 * stats as it is no longer stable enough.
-		 */
-		if (irqs->anomalies++ >= 3) {
-			memset(irqs, 0, sizeof(*irqs));
-			irqs->last_ts = ts;
-			return;
-		}
-	} else {
-		/*
-		 * The anomalies must be consecutives, so at this
-		 * point, we reset the anomalies counter.
-		 */
-		irqs->anomalies = 0;
-	}
-
-	/*
-	 * The interrupt is considered stable enough to try to predict
-	 * the next event on it.
+	 * Get the index in the ema table for this interrupt. The
+	 * PREDICTION_FACTOR increases the interval size for the array
+	 * of exponential averages.
 	 */
-	irqs->valid = 1;
+	index = likely(interval) ?
+		ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
 
 	/*
-	 * Online average algorithm:
-	 *
-	 * new_average = average + ((value - average) / count)
-	 *
-	 * The variance computation depends on the new average
-	 * to be computed here first.
-	 *
+	 * Store the index as an element of the pattern in another
+	 * circular array.
 	 */
-	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
 
-	/*
-	 * Online variance algorithm:
-	 *
-	 * new_variance = variance + (value - average) x (value - new_average)
-	 *
-	 * Warning: irqs->avg is updated with the line above, hence
-	 * 'interval - irqs->avg' is no longer equal to 'diff'
-	 */
-	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
+	irqs->ema_time[index] = irq_timings_ema_new(interval,
+						    irqs->ema_time[index]);
 
-	/*
-	 * Update the next event
-	 */
-	irqs->next_evt = ts + irqs->avg;
+	irqs->count++;
 }
 
 /**
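
And a companion user-space sketch (again illustrative only, with a made-up interval value) of the two per-interval steps in irq_timings_store() above: computing the EMA slot index and updating that slot's exponential moving average. The indexes listed in Example 1 appear to correspond to this computation, i.e. ilog2 of the interval scaled down by the ns-to-us shift and PREDICTION_FACTOR:

#include <stdint.h>
#include <stdio.h>

#define EMA_ALPHA_VAL		64
#define EMA_ALPHA_SHIFT		7
#define PREDICTION_FACTOR	4

/* Plain integer log2 standing in for the kernel's ilog2() */
static int ilog2_u64(uint64_t v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

/* Same arithmetic as irq_timings_ema_new(); relies on an arithmetic
 * right shift of negative values, as the kernel code does. */
static uint64_t ema_new(uint64_t value, uint64_t ema_old)
{
	int64_t diff;

	if (!ema_old)
		return value;

	diff = (int64_t)(value - ema_old) * EMA_ALPHA_VAL;
	return ema_old + (diff >> EMA_ALPHA_SHIFT);
}

int main(void)
{
	uint64_t interval = 1386000;	/* ~1386us interval, in nanoseconds */
	uint64_t ema = 0;
	int index;

	/* ns -> us (>> 10), then widen the buckets by PREDICTION_FACTOR */
	index = interval ? ilog2_u64((interval >> 10) / PREDICTION_FACTOR) : 0;

	ema = ema_new(interval, ema);	/* first sample: EMA == value */
	ema = ema_new(1384000, ema);	/* later samples move it slowly */

	/* Prints: index 8, ema 1385000 */
	printf("index %d, ema %llu\n", index, (unsigned long long)ema);

	return 0;
}

Intervals around 214-224 ms land on index 15 by the same computation, which is how a sequence like Example 1's 15, 8, 8, 8, 8 pattern arises.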
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
 	 */
 	lockdep_assert_irqs_disabled();
 
+	if (!irqts->count)
+		return next_evt;
+
 	/*
 	 * Number of elements in the circular buffer: If it happens it
 	 * was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
 	 * type but with the cost of extra computation in the
 	 * interrupt handler hot path. We choose efficiency.
 	 *
-	 * Inject measured irq/timestamp to the statistical model
-	 * while decrementing the counter because we consume the data
-	 * from our circular buffer.
+	 * Inject measured irq/timestamp to the pattern prediction
+	 * model while decrementing the counter because we consume the
+	 * data from our circular buffer.
 	 */
-	for (i = irqts->count & IRQ_TIMINGS_MASK,
-	     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
 
-		irq = irq_timing_decode(irqts->values[i], &ts);
+	i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
+	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
 
+	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+		irq = irq_timing_decode(irqts->values[i], &ts);
 		s = idr_find(&irqt_stats, irq);
-		if (s) {
-			irqs = this_cpu_ptr(s);
-			irqs_update(irqs, ts);
-		}
+		if (s)
+			irq_timings_store(irq, this_cpu_ptr(s), ts);
 	}
 
 	/*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
 
 		irqs = this_cpu_ptr(s);
 
-		if (!irqs->valid)
-			continue;
+		ts = __irq_timings_next_event(irqs, i, now);
+		if (ts <= now)
+			return now;
 
-		if (irqs->next_evt <= now) {
-			irq = i;
-			next_evt = now;
-
-			/*
-			 * This interrupt mustn't use in the future
-			 * until new events occur and update the
-			 * statistics.
-			 */
-			irqs->valid = 0;
-			break;
-		}
-
-		if (irqs->next_evt < next_evt) {
-			irq = i;
-			next_evt = irqs->next_evt;
-		}
+		if (ts < next_evt)
+			next_evt = ts;
 	}
 
 	return next_evt;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6b7cdf17ccf8..73288914ed5e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void)
  */
 }
 
-/*
- * Enqueue the irq_work @work on @cpu unless it's already pending
- * somewhere.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue_on(struct irq_work *work, int cpu)
+/* Enqueue on current CPU, work must already be claimed and preempt disabled */
+static void __irq_work_queue_local(struct irq_work *work)
 {
-	/* All work should have been flushed before going offline */
-	WARN_ON_ONCE(cpu_is_offline(cpu));
-
-#ifdef CONFIG_SMP
-
-	/* Arch remote IPI send/receive backend aren't NMI safe */
-	WARN_ON_ONCE(in_nmi());
+	/* If the work is "lazy", handle it from next tick if any */
+	if (work->flags & IRQ_WORK_LAZY) {
+		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+		    tick_nohz_tick_stopped())
+			arch_irq_work_raise();
+	} else {
+		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+			arch_irq_work_raise();
+	}
+}
 
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
 		return false;
 
-	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
-		arch_send_call_function_single_ipi(cpu);
-
-#else /* #ifdef CONFIG_SMP */
-	irq_work_queue(work);
-#endif /* #else #ifdef CONFIG_SMP */
+	/* Queue the entry and raise the IPI if needed. */
+	preempt_disable();
+	__irq_work_queue_local(work);
+	preempt_enable();
 
 	return true;
 }
+EXPORT_SYMBOL_GPL(irq_work_queue);
 
-/* Enqueue the irq work @work on the current CPU */
-bool irq_work_queue(struct irq_work *work)
+/*
+ * Enqueue the irq_work @work on @cpu unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
 {
+#ifndef CONFIG_SMP
+	return irq_work_queue(work);
+
+#else /* CONFIG_SMP: */
+	/* All work should have been flushed before going offline */
+	WARN_ON_ONCE(cpu_is_offline(cpu));
+
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
 		return false;
 
-	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
-
-	/* If the work is "lazy", handle it from next tick if any */
-	if (work->flags & IRQ_WORK_LAZY) {
-		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
-		    tick_nohz_tick_stopped())
-			arch_irq_work_raise();
+	if (cpu != smp_processor_id()) {
+		/* Arch remote IPI send/receive backends aren't NMI safe */
+		WARN_ON_ONCE(in_nmi());
+		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+			arch_send_call_function_single_ipi(cpu);
 	} else {
-		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
-			arch_irq_work_raise();
+		__irq_work_queue_local(work);
 	}
-
 	preempt_enable();
 
 	return true;
+#endif /* CONFIG_SMP */
 }
-EXPORT_SYMBOL_GPL(irq_work_queue);
+
 
 bool irq_work_needs_cpu(void)
 {
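
Finally, a short sketch of a hypothetical irq_work user, just to show which of the refactored paths each call takes (function and variable names are made up; the init_irq_work()/irq_work_queue()/irq_work_queue_on() calls are the existing API):

#include <linux/irq_work.h>
#include <linux/smp.h>
#include <linux/printk.h>

static struct irq_work my_work;

/* Runs in hard interrupt context, from the self-IPI or the next tick */
static void my_work_fn(struct irq_work *work)
{
	pr_info("irq_work ran on CPU%d\n", smp_processor_id());
}

static void my_init_and_kick(void)
{
	init_irq_work(&my_work, my_work_fn);

	/*
	 * Local queueing: goes through __irq_work_queue_local(), i.e.
	 * raised_list (or lazy_list for IRQ_WORK_LAZY work) plus a
	 * self-IPI only when actually needed.
	 */
	irq_work_queue(&my_work);

	/*
	 * Cross-CPU queueing (returns false here if my_work is still
	 * pending from the call above): only a genuinely remote target
	 * gets arch_send_call_function_single_ipi(); after this patch
	 * the local-CPU case falls back to the local path instead of
	 * sending an IPI to itself.
	 */
	irq_work_queue_on(&my_work, 1);
}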