aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2009-11-13 11:05:44 -0500
committerThomas Gleixner <tglx@linutronix.de>2009-12-10 07:08:11 -0500
commit41d2e494937715d3150e5c75d01f0e75ae899337 (patch)
tree9bc7270aa7b06ed065671a96085fbdf235977d91
parent3067e02f8f3ae2f3f02ba76400d03b8bcb4942b0 (diff)
hrtimer: Tune hrtimer_interrupt hang logic
The hrtimer_interrupt hang logic adjusts min_delta_ns based on the execution time of the hrtimer callbacks. This is error-prone for virtual machines, where a guest vcpu can be scheduled out during the execution of the callbacks (and the callbacks themselves can do operations that translate to blocking operations in the hypervisor), which in turn can lead to large min_delta_ns rendering the system unusable. Replace the current heuristics with something more reliable. Allow the interrupt code to try 3 times to catch up with the lost time. If that fails use the total time spent in the interrupt handler to defer the next timer interrupt so the system can catch up with other things which got delayed. Limit that deferment to 100ms. The retry events and the maximum time spent in the interrupt handler are recorded and exposed via /proc/timer_list. Inspired by a patch from Marcelo. Reported-by: Michael Tokarev <mjt@tls.msk.ru> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Marcelo Tosatti <mtosatti@redhat.com> Cc: kvm@vger.kernel.org
-rw-r--r--include/linux/hrtimer.h13
-rw-r--r--kernel/hrtimer.c97
-rw-r--r--kernel/time/timer_list.c5
3 files changed, 70 insertions, 45 deletions
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 9bace4b9f4fe..040b6796ab4d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -162,10 +162,11 @@ struct hrtimer_clock_base {
162 * @expires_next: absolute time of the next event which was scheduled 162 * @expires_next: absolute time of the next event which was scheduled
163 * via clock_set_next_event() 163 * via clock_set_next_event()
164 * @hres_active: State of high resolution mode 164 * @hres_active: State of high resolution mode
165 * @check_clocks: Indictator, when set evaluate time source and clock 165 * @hang_detected: The last hrtimer interrupt detected a hang
166 * event devices whether high resolution mode can be 166 * @nr_events: Total number of hrtimer interrupt events
167 * activated. 167 * @nr_retries: Total number of hrtimer interrupt retries
168 * @nr_events: Total number of timer interrupt events 168 * @nr_hangs: Total number of hrtimer interrupt hangs
169 * @max_hang_time: Maximum time spent in hrtimer_interrupt
169 */ 170 */
170struct hrtimer_cpu_base { 171struct hrtimer_cpu_base {
171 spinlock_t lock; 172 spinlock_t lock;
@@ -173,7 +174,11 @@ struct hrtimer_cpu_base {
173#ifdef CONFIG_HIGH_RES_TIMERS 174#ifdef CONFIG_HIGH_RES_TIMERS
174 ktime_t expires_next; 175 ktime_t expires_next;
175 int hres_active; 176 int hres_active;
177 int hang_detected;
176 unsigned long nr_events; 178 unsigned long nr_events;
179 unsigned long nr_retries;
180 unsigned long nr_hangs;
181 ktime_t max_hang_time;
177#endif 182#endif
178}; 183};
179 184
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ede527708123..931a4d99bc55 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -1217,30 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1226
1218#ifdef CONFIG_HIGH_RES_TIMERS 1227#ifdef CONFIG_HIGH_RES_TIMERS
1219 1228
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %llu ns\n",
1242 (unsigned long long) dev->min_delta_ns);
1243}
1244/* 1229/*
1245 * High resolution timer interrupt 1230 * High resolution timer interrupt
1246 * Called with interrupts disabled 1231 * Called with interrupts disabled
@@ -1249,21 +1234,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1249{ 1234{
1250 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1235 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1251 struct hrtimer_clock_base *base; 1236 struct hrtimer_clock_base *base;
1252 ktime_t expires_next, now; 1237 ktime_t expires_next, now, entry_time, delta;
1253 int nr_retries = 0; 1238 int i, retries = 0;
1254 int i;
1255 1239
1256 BUG_ON(!cpu_base->hres_active); 1240 BUG_ON(!cpu_base->hres_active);
1257 cpu_base->nr_events++; 1241 cpu_base->nr_events++;
1258 dev->next_event.tv64 = KTIME_MAX; 1242 dev->next_event.tv64 = KTIME_MAX;
1259 1243
1260 retry: 1244 entry_time = now = ktime_get();
1261 /* 5 retries is enough to notice a hang */ 1245retry:
1262 if (!(++nr_retries % 5))
1263 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1264
1265 now = ktime_get();
1266
1267 expires_next.tv64 = KTIME_MAX; 1246 expires_next.tv64 = KTIME_MAX;
1268 1247
1269 spin_lock(&cpu_base->lock); 1248 spin_lock(&cpu_base->lock);
@@ -1325,10 +1304,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1325 spin_unlock(&cpu_base->lock); 1304 spin_unlock(&cpu_base->lock);
1326 1305
1327 /* Reprogramming necessary ? */ 1306 /* Reprogramming necessary ? */
1328 if (expires_next.tv64 != KTIME_MAX) { 1307 if (expires_next.tv64 == KTIME_MAX ||
1329 if (tick_program_event(expires_next, force_clock_reprogram)) 1308 !tick_program_event(expires_next, 0)) {
1330 goto retry; 1309 cpu_base->hang_detected = 0;
1310 return;
1331 } 1311 }
1312
1313 /*
1314 * The next timer was already expired due to:
1315 * - tracing
1316 * - long lasting callbacks
1317 * - being scheduled away when running in a VM
1318 *
1319 * We need to prevent that we loop forever in the hrtimer
1320 * interrupt routine. We give it 3 attempts to avoid
1321 * overreacting on some spurious event.
1322 */
1323 now = ktime_get();
1324 cpu_base->nr_retries++;
1325 if (++retries < 3)
1326 goto retry;
1327 /*
1328 * Give the system a chance to do something else than looping
1329 * here. We stored the entry time, so we know exactly how long
1330 * we spent here. We schedule the next event this amount of
1331 * time away.
1332 */
1333 cpu_base->nr_hangs++;
1334 cpu_base->hang_detected = 1;
1335 delta = ktime_sub(now, entry_time);
1336 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1337 cpu_base->max_hang_time = delta;
1338 /*
1339 * Limit it to a sensible value as we enforce a longer
1340 * delay. Give the CPU at least 100ms to catch up.
1341 */
1342 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1343 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1344 else
1345 expires_next = ktime_add(now, delta);
1346 tick_program_event(expires_next, 1);
1347 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1348 ktime_to_ns(delta));
1332} 1349}
1333 1350
1334/* 1351/*
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 665c76edbf17..9d80db4747d4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
254 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
255 int cpu; 258 int cpu;
256 259
257 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
258 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
259 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
260 263