author		Thomas Gleixner <tglx@linutronix.de>	2009-07-10 08:57:05 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2009-07-10 11:32:55 -0400
commit		6ff7041dbfeb3bd7dfe9aa67275c21199ef760d6 (patch)
tree		c8897de7ba6c09d51d83557d981ced3da48bba61 /kernel
parent		7e0c5086c172ecf8b0c2ad860b02a586967d17d0 (diff)
hrtimer: Fix migration expiry check
The timer migration expiry check should prevent the migration of a
timer to another CPU when the timer expires before the next event is
scheduled on the other CPU. Migrating the timer might delay it because
we cannot reprogram the clock event device on the other CPU. But the
code implementing that check has two flaws:

- for !HIGHRES the check compares the expiry value with the clock
  event device's expiry value, which is wrong for CLOCK_REALTIME based
  timers.

- the check is racy. It holds the hrtimer base lock of the target CPU,
  but the clock event device expiry value can be modified nevertheless,
  e.g. by a timer interrupt firing.

The !HIGHRES case is easy to fix: we can enqueue the timer on the CPU
which was selected by the load balancer. It runs the idle balancing
code once per jiffy anyway, so the maximum delay for the timer is the
same as when we keep the tick on the current CPU going.

In the HIGHRES case we can get the next expiry value from the hrtimer
cpu_base of the target CPU and serialize the update with the cpu_base
lock. This moves the lock section in hrtimer_interrupt() so we can set
cpu_base->expires_next to KTIME_MAX while we are handling the expired
timers and set it to the next expiry value after we have handled the
timers, both under the base lock. While the expired timers are being
processed, timer migration to this CPU is blocked because expires_next
is KTIME_MAX and any timer's expiry time is always <= KTIME_MAX, so
the check refuses the migration.

Also remove the now useless clockevents_get_next_event() function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
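For illustration only: below is a minimal, self-contained user-space
sketch (build with cc -pthread) of the serialization pattern described
above, i.e. check the target's next programmed event under its base
lock, and pin that value at KTIME_MAX while the expired timers run.
The struct, function names and time values are invented stand-ins, not
the kernel's hrtimer API; the real change follows in the diff below.

/* Sketch only: models the locking scheme, not kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX	/* "no event programmed" sentinel */

struct cpu_base_model {			/* stand-in for hrtimer_cpu_base */
	pthread_mutex_t lock;		/* models cpu_base->lock */
	int64_t expires_next;		/* models cpu_base->expires_next */
};

/*
 * Models the expiry check: with the target's lock held, refuse the
 * migration when the timer would expire before the target's next
 * programmed event, since the target cannot be reprogrammed remotely.
 */
static bool expires_before_next_event(int64_t expires,
				      struct cpu_base_model *target)
{
	return expires <= target->expires_next;
}

/*
 * Models the reworked interrupt path: expires_next is pinned at
 * KTIME_MAX under the lock while expired timers run, so a concurrent
 * migration check can never pass against a stale value.
 */
static void handle_expired(struct cpu_base_model *cb, int64_t next_pending)
{
	pthread_mutex_lock(&cb->lock);
	cb->expires_next = KTIME_MAX;	/* blocks migration to this CPU */
	pthread_mutex_unlock(&cb->lock);

	/* ... expired timer callbacks would run here, lock dropped ... */

	pthread_mutex_lock(&cb->lock);
	cb->expires_next = next_pending;	/* publish the new expiry */
	pthread_mutex_unlock(&cb->lock);
}

int main(void)
{
	struct cpu_base_model target = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.expires_next = 2000,	/* target's next event at t=2000 */
	};

	pthread_mutex_lock(&target.lock);
	/* A timer expiring at t=1500 must stay on the current CPU ... */
	printf("migrate t=1500? %s\n",
	       expires_before_next_event(1500, &target) ? "no" : "yes");
	/* ... while one expiring at t=2500 may be migrated. */
	printf("migrate t=2500? %s\n",
	       expires_before_next_event(2500, &target) ? "no" : "yes");
	pthread_mutex_unlock(&target.lock);

	handle_expired(&target, 3000);
	return 0;
}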
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/hrtimer.c           | 121
-rw-r--r--  kernel/time/clockevents.c  |  11
2 files changed, 64 insertions(+), 68 deletions(-)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 126b9808f287..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 	}
 }
 
+
+/*
+ * Get the preferred target CPU for NOHZ
+ */
+static int hrtimer_get_target(int this_cpu, int pinned)
+{
+#ifdef CONFIG_NO_HZ
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
+		int preferred_cpu = get_nohz_load_balancer();
+
+		if (preferred_cpu >= 0)
+			return preferred_cpu;
+	}
+#endif
+	return this_cpu;
+}
+
+/*
+ * With HIGHRES=y we do not migrate the timer when it is expiring
+ * before the next event on the target cpu because we cannot reprogram
+ * the target cpu hardware and we would cause it to fire late.
+ *
+ * Called with cpu_base->lock of target cpu held.
+ */
+static int
+hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+	ktime_t expires;
+
+	if (!new_base->cpu_base->hres_active)
+		return 0;
+
+	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
+	return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -200,27 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 {
 	struct hrtimer_clock_base *new_base;
 	struct hrtimer_cpu_base *new_cpu_base;
-	int cpu, preferred_cpu = -1;
-
-	cpu = smp_processor_id();
-#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
-		preferred_cpu = get_nohz_load_balancer();
-		if (preferred_cpu >= 0) {
-			/*
-			 * We must not check the expiry value when
-			 * preferred_cpu is the current cpu. If base
-			 * != new_base we would loop forever when the
-			 * timer expires before the current programmed
-			 * next timer event.
-			 */
-			if (preferred_cpu != cpu)
-				cpu = preferred_cpu;
-			else
-				preferred_cpu = -1;
-		}
-	}
-#endif
+	int this_cpu = smp_processor_id();
+	int cpu = hrtimer_get_target(this_cpu, pinned);
 
 again:
 	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -228,7 +249,7 @@ again:
 
 	if (base != new_base) {
 		/*
-		 * We are trying to schedule the timer on the local CPU.
+		 * We are trying to move timer to new_base.
 		 * However we can't change timer's base while it is running,
 		 * so we keep it on the same CPU. No hassle vs. reprogramming
 		 * the event source in the high resolution case. The softirq
@@ -244,38 +265,12 @@ again:
 		spin_unlock(&base->cpu_base->lock);
 		spin_lock(&new_base->cpu_base->lock);
 
-		/* Optimized away for NOHZ=n SMP=n */
-		if (cpu == preferred_cpu) {
-			/* Calculate clock monotonic expiry time */
-#ifdef CONFIG_HIGH_RES_TIMERS
-			ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
-							new_base->offset);
-#else
-			ktime_t expires = hrtimer_get_expires(timer);
-#endif
-
-			/*
-			 * Get the next event on target cpu from the
-			 * clock events layer.
-			 * This covers the highres=off nohz=on case as well.
-			 */
-			ktime_t next = clockevents_get_next_event(cpu);
-
-			ktime_t delta = ktime_sub(expires, next);
-
-			/*
-			 * We do not migrate the timer when it is expiring
-			 * before the next event on the target cpu because
-			 * we cannot reprogram the target cpu hardware and
-			 * we would cause it to fire late.
-			 */
-			if (delta.tv64 < 0) {
-				cpu = smp_processor_id();
-				spin_unlock(&new_base->cpu_base->lock);
-				spin_lock(&base->cpu_base->lock);
-				timer->base = base;
-				goto again;
-			}
+		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+			cpu = this_cpu;
+			spin_unlock(&new_base->cpu_base->lock);
+			spin_lock(&base->cpu_base->lock);
+			timer->base = base;
+			goto again;
 		}
 		timer->base = new_base;
 	}
@@ -1287,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 	expires_next.tv64 = KTIME_MAX;
 
+	spin_lock(&cpu_base->lock);
+	/*
+	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+	 * held to prevent that a timer is enqueued in our queue via
+	 * the migration code. This does not affect enqueueing of
+	 * timers which run their callback and need to be requeued on
+	 * this CPU.
+	 */
+	cpu_base->expires_next.tv64 = KTIME_MAX;
+
 	base = cpu_base->clock_base;
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		ktime_t basenow;
 		struct rb_node *node;
 
-		spin_lock(&cpu_base->lock);
-
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
@@ -1327,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 			__run_hrtimer(timer);
 		}
-		spin_unlock(&cpu_base->lock);
 		base++;
 	}
 
+	/*
+	 * Store the new expiry value so the migration code can verify
+	 * against it.
+	 */
 	cpu_base->expires_next = expires_next;
+	spin_unlock(&cpu_base->lock);
 
 	/* Reprogramming necessary ? */
 	if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..a6dcd67b041d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg)
 	spin_unlock(&clockevents_lock);
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
-
-ktime_t clockevents_get_next_event(int cpu)
-{
-	struct tick_device *td;
-	struct clock_event_device *dev;
-
-	td = &per_cpu(tick_cpu_device, cpu);
-	dev = td->evtdev;
-
-	return dev->next_event;
-}
 #endif