diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2009-11-13 11:05:44 -0500 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2009-12-10 07:08:11 -0500 |
commit | 41d2e494937715d3150e5c75d01f0e75ae899337 (patch) | |
tree | 9bc7270aa7b06ed065671a96085fbdf235977d91 | |
parent | 3067e02f8f3ae2f3f02ba76400d03b8bcb4942b0 (diff) |
hrtimer: Tune hrtimer_interrupt hang logic
The hrtimer_interrupt hang logic adjusts min_delta_ns based on the
execution time of the hrtimer callbacks.
This is error-prone for virtual machines, where a guest vcpu can be
scheduled out during the execution of the callbacks (and the callbacks
themselves can do operations that translate to blocking operations in
the hypervisor), which in turn can lead to large min_delta_ns rendering the
system unusable.
Replace the current heuristics with something more reliable. Allow the
interrupt code to try 3 times to catch up with the lost time. If that
fails, use the total time spent in the interrupt handler to defer the
next timer interrupt so the system can catch up with other things
which got delayed. Limit that deferment to 100ms.
The retry events and the maximum time spent in the interrupt handler
are recorded and exposed via /proc/timer_list.
Inspired by a patch from Marcelo.
Reported-by: Michael Tokarev <mjt@tls.msk.ru>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: kvm@vger.kernel.org
-rw-r--r-- | include/linux/hrtimer.h | 13 | ||||
-rw-r--r-- | kernel/hrtimer.c | 97 | ||||
-rw-r--r-- | kernel/time/timer_list.c | 5 |
3 files changed, 70 insertions, 45 deletions
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 9bace4b9f4fe..040b6796ab4d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h | |||
@@ -162,10 +162,11 @@ struct hrtimer_clock_base { | |||
162 | * @expires_next: absolute time of the next event which was scheduled | 162 | * @expires_next: absolute time of the next event which was scheduled |
163 | * via clock_set_next_event() | 163 | * via clock_set_next_event() |
164 | * @hres_active: State of high resolution mode | 164 | * @hres_active: State of high resolution mode |
165 | * @check_clocks: Indictator, when set evaluate time source and clock | 165 | * @hang_detected: The last hrtimer interrupt detected a hang |
166 | * event devices whether high resolution mode can be | 166 | * @nr_events: Total number of hrtimer interrupt events |
167 | * activated. | 167 | * @nr_retries: Total number of hrtimer interrupt retries |
168 | * @nr_events: Total number of timer interrupt events | 168 | * @nr_hangs: Total number of hrtimer interrupt hangs |
169 | * @max_hang_time: Maximum time spent in hrtimer_interrupt | ||
169 | */ | 170 | */ |
170 | struct hrtimer_cpu_base { | 171 | struct hrtimer_cpu_base { |
171 | spinlock_t lock; | 172 | spinlock_t lock; |
@@ -173,7 +174,11 @@ struct hrtimer_cpu_base { | |||
173 | #ifdef CONFIG_HIGH_RES_TIMERS | 174 | #ifdef CONFIG_HIGH_RES_TIMERS |
174 | ktime_t expires_next; | 175 | ktime_t expires_next; |
175 | int hres_active; | 176 | int hres_active; |
177 | int hang_detected; | ||
176 | unsigned long nr_events; | 178 | unsigned long nr_events; |
179 | unsigned long nr_retries; | ||
180 | unsigned long nr_hangs; | ||
181 | ktime_t max_hang_time; | ||
177 | #endif | 182 | #endif |
178 | }; | 183 | }; |
179 | 184 | ||
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ede527708123..931a4d99bc55 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
557 | static int hrtimer_reprogram(struct hrtimer *timer, | 557 | static int hrtimer_reprogram(struct hrtimer *timer, |
558 | struct hrtimer_clock_base *base) | 558 | struct hrtimer_clock_base *base) |
559 | { | 559 | { |
560 | ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; | 560 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
561 | ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 561 | ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
562 | int res; | 562 | int res; |
563 | 563 | ||
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
582 | if (expires.tv64 < 0) | 582 | if (expires.tv64 < 0) |
583 | return -ETIME; | 583 | return -ETIME; |
584 | 584 | ||
585 | if (expires.tv64 >= expires_next->tv64) | 585 | if (expires.tv64 >= cpu_base->expires_next.tv64) |
586 | return 0; | ||
587 | |||
588 | /* | ||
589 | * If a hang was detected in the last timer interrupt then we | ||
590 | * do not schedule a timer which is earlier than the expiry | ||
591 | * which we enforced in the hang detection. We want the system | ||
592 | * to make progress. | ||
593 | */ | ||
594 | if (cpu_base->hang_detected) | ||
586 | return 0; | 595 | return 0; |
587 | 596 | ||
588 | /* | 597 | /* |
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
590 | */ | 599 | */ |
591 | res = tick_program_event(expires, 0); | 600 | res = tick_program_event(expires, 0); |
592 | if (!IS_ERR_VALUE(res)) | 601 | if (!IS_ERR_VALUE(res)) |
593 | *expires_next = expires; | 602 | cpu_base->expires_next = expires; |
594 | return res; | 603 | return res; |
595 | } | 604 | } |
596 | 605 | ||
@@ -1217,30 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
1217 | 1226 | ||
1218 | #ifdef CONFIG_HIGH_RES_TIMERS | 1227 | #ifdef CONFIG_HIGH_RES_TIMERS |
1219 | 1228 | ||
1220 | static int force_clock_reprogram; | ||
1221 | |||
1222 | /* | ||
1223 | * After 5 iteration's attempts, we consider that hrtimer_interrupt() | ||
1224 | * is hanging, which could happen with something that slows the interrupt | ||
1225 | * such as the tracing. Then we force the clock reprogramming for each future | ||
1226 | * hrtimer interrupts to avoid infinite loops and use the min_delta_ns | ||
1227 | * threshold that we will overwrite. | ||
1228 | * The next tick event will be scheduled to 3 times we currently spend on | ||
1229 | * hrtimer_interrupt(). This gives a good compromise, the cpus will spend | ||
1230 | * 1/4 of their time to process the hrtimer interrupts. This is enough to | ||
1231 | * let it running without serious starvation. | ||
1232 | */ | ||
1233 | |||
1234 | static inline void | ||
1235 | hrtimer_interrupt_hanging(struct clock_event_device *dev, | ||
1236 | ktime_t try_time) | ||
1237 | { | ||
1238 | force_clock_reprogram = 1; | ||
1239 | dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; | ||
1240 | printk(KERN_WARNING "hrtimer: interrupt too slow, " | ||
1241 | "forcing clock min delta to %llu ns\n", | ||
1242 | (unsigned long long) dev->min_delta_ns); | ||
1243 | } | ||
1244 | /* | 1229 | /* |
1245 | * High resolution timer interrupt | 1230 | * High resolution timer interrupt |
1246 | * Called with interrupts disabled | 1231 | * Called with interrupts disabled |
@@ -1249,21 +1234,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1249 | { | 1234 | { |
1250 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1235 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1251 | struct hrtimer_clock_base *base; | 1236 | struct hrtimer_clock_base *base; |
1252 | ktime_t expires_next, now; | 1237 | ktime_t expires_next, now, entry_time, delta; |
1253 | int nr_retries = 0; | 1238 | int i, retries = 0; |
1254 | int i; | ||
1255 | 1239 | ||
1256 | BUG_ON(!cpu_base->hres_active); | 1240 | BUG_ON(!cpu_base->hres_active); |
1257 | cpu_base->nr_events++; | 1241 | cpu_base->nr_events++; |
1258 | dev->next_event.tv64 = KTIME_MAX; | 1242 | dev->next_event.tv64 = KTIME_MAX; |
1259 | 1243 | ||
1260 | retry: | 1244 | entry_time = now = ktime_get(); |
1261 | /* 5 retries is enough to notice a hang */ | 1245 | retry: |
1262 | if (!(++nr_retries % 5)) | ||
1263 | hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); | ||
1264 | |||
1265 | now = ktime_get(); | ||
1266 | |||
1267 | expires_next.tv64 = KTIME_MAX; | 1246 | expires_next.tv64 = KTIME_MAX; |
1268 | 1247 | ||
1269 | spin_lock(&cpu_base->lock); | 1248 | spin_lock(&cpu_base->lock); |
@@ -1325,10 +1304,48 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1325 | spin_unlock(&cpu_base->lock); | 1304 | spin_unlock(&cpu_base->lock); |
1326 | 1305 | ||
1327 | /* Reprogramming necessary ? */ | 1306 | /* Reprogramming necessary ? */ |
1328 | if (expires_next.tv64 != KTIME_MAX) { | 1307 | if (expires_next.tv64 == KTIME_MAX || |
1329 | if (tick_program_event(expires_next, force_clock_reprogram)) | 1308 | !tick_program_event(expires_next, 0)) { |
1330 | goto retry; | 1309 | cpu_base->hang_detected = 0; |
1310 | return; | ||
1331 | } | 1311 | } |
1312 | |||
1313 | /* | ||
1314 | * The next timer was already expired due to: | ||
1315 | * - tracing | ||
1316 | * - long lasting callbacks | ||
1317 | * - being scheduled away when running in a VM | ||
1318 | * | ||
1319 | * We need to prevent that we loop forever in the hrtimer | ||
1320 | * interrupt routine. We give it 3 attempts to avoid | ||
1321 | * overreacting on some spurious event. | ||
1322 | */ | ||
1323 | now = ktime_get(); | ||
1324 | cpu_base->nr_retries++; | ||
1325 | if (++retries < 3) | ||
1326 | goto retry; | ||
1327 | /* | ||
1328 | * Give the system a chance to do something else than looping | ||
1329 | * here. We stored the entry time, so we know exactly how long | ||
1330 | * we spent here. We schedule the next event this amount of | ||
1331 | * time away. | ||
1332 | */ | ||
1333 | cpu_base->nr_hangs++; | ||
1334 | cpu_base->hang_detected = 1; | ||
1335 | delta = ktime_sub(now, entry_time); | ||
1336 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | ||
1337 | cpu_base->max_hang_time = delta; | ||
1338 | /* | ||
1339 | * Limit it to a sensible value as we enforce a longer | ||
1340 | * delay. Give the CPU at least 100ms to catch up. | ||
1341 | */ | ||
1342 | if (delta.tv64 > 100 * NSEC_PER_MSEC) | ||
1343 | expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); | ||
1344 | else | ||
1345 | expires_next = ktime_add(now, delta); | ||
1346 | tick_program_event(expires_next, 1); | ||
1347 | printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", | ||
1348 | ktime_to_ns(delta)); | ||
1332 | } | 1349 | } |
1333 | 1350 | ||
1334 | /* | 1351 | /* |
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 665c76edbf17..9d80db4747d4 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
150 | P_ns(expires_next); | 150 | P_ns(expires_next); |
151 | P(hres_active); | 151 | P(hres_active); |
152 | P(nr_events); | 152 | P(nr_events); |
153 | P(nr_retries); | ||
154 | P(nr_hangs); | ||
155 | P_ns(max_hang_time); | ||
153 | #endif | 156 | #endif |
154 | #undef P | 157 | #undef P |
155 | #undef P_ns | 158 | #undef P_ns |
@@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v) | |||
254 | u64 now = ktime_to_ns(ktime_get()); | 257 | u64 now = ktime_to_ns(ktime_get()); |
255 | int cpu; | 258 | int cpu; |
256 | 259 | ||
257 | SEQ_printf(m, "Timer List Version: v0.4\n"); | 260 | SEQ_printf(m, "Timer List Version: v0.5\n"); |
258 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | 261 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); |
259 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | 262 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); |
260 | 263 | ||