aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mike Isely <isely@isely.net> 2008-03-13 16:30:35 -0400
committer Dave Airlie <airlied@redhat.com> 2008-03-16 19:54:58 -0400
commit 9df5808cca52f33e1deb52b5010c68c6ed1656fe (patch)
tree 1a343571587fc978b991509cf38e2727ffc5d7b0
parent a978b30af3bab0dd9af9350eeda25e76123fa28e (diff)
drm: Fix race that can lockup the kernel
The i915_vblank_swap() function schedules an automatic buffer swap upon receipt of the vertical sync interrupt. Such an operation is lengthy so it can't be allowed to happen in normal interrupt context, thus the DRM implements this by scheduling the work in a kernel softirq-scheduled tasklet. In order for the buffer swap to work safely, the DRM's central lock must be taken, via a call to drm_lock_take() located in drivers/char/drm/drm_irq.c within the function drm_locked_tasklet_func(). The lock-taking logic uses a non-interrupt-blocking spinlock to implement the manipulations needed to take the lock. These semantics would be safe if every use of the spinlock happened from process context. However, this buffer swap happens from softirq context, which is really a form of interrupt context. Thus we have an unsafe situation, in that drm_locked_tasklet_func() can block on a spinlock already taken by a thread in process context on the same CPU, which will never get scheduled again because of the blocked softirq tasklet. This wedges the kernel hard. To trigger this bug, run a dual-head cloned mode configuration which uses the i915 DRM driver, then execute an OpenGL application which synchronizes buffer swaps against the vertical sync interrupt. In my testing, a lockup always results after running anywhere from 5 minutes to an hour and a half. I believe dual-head is needed to really trigger the problem because then the vertical sync interrupt handling is no longer predictable (due to being interrupt-sourced from two different heads running at different speeds). This raises the probability of the tasklet trying to run while the userspace DRI is doing things to the GPU (and manipulating the DRM lock). The fix is to change the relevant spinlock semantics to be the interrupt-blocking form. After this change I am no longer able to trigger the lockup; the longest test run so far was 20 hours (test stopped after that point).
Note: I have examined the places where this spinlock is being employed; all are reasonably short bounded sequences and should be suitable for interrupts being blocked without impacting overall kernel interrupt response latency.
Signed-off-by: Mike Isely <isely@pobox.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
-rw-r--r-- drivers/char/drm/drm_fops.c 7
-rw-r--r-- drivers/char/drm/drm_lock.c 35
2 files changed, 25 insertions, 17 deletions
diff --git a/drivers/char/drm/drm_fops.c b/drivers/char/drm/drm_fops.c
index 3992f73299cc..f09d4b5002b0 100644
--- a/drivers/char/drm/drm_fops.c
+++ b/drivers/char/drm/drm_fops.c
@@ -326,6 +326,7 @@ int drm_release(struct inode *inode, struct file *filp)
326 struct drm_file *file_priv = filp->private_data; 326 struct drm_file *file_priv = filp->private_data;
327 struct drm_device *dev = file_priv->head->dev; 327 struct drm_device *dev = file_priv->head->dev;
328 int retcode = 0; 328 int retcode = 0;
329 unsigned long irqflags;
329 330
330 lock_kernel(); 331 lock_kernel();
331 332
@@ -357,9 +358,11 @@ int drm_release(struct inode *inode, struct file *filp)
357 */ 358 */
358 359
359 do{ 360 do{
360 spin_lock(&dev->lock.spinlock); 361 spin_lock_irqsave(&dev->lock.spinlock,
362 irqflags);
361 locked = dev->lock.idle_has_lock; 363 locked = dev->lock.idle_has_lock;
362 spin_unlock(&dev->lock.spinlock); 364 spin_unlock_irqrestore(&dev->lock.spinlock,
365 irqflags);
363 if (locked) 366 if (locked)
364 break; 367 break;
365 schedule(); 368 schedule();
diff --git a/drivers/char/drm/drm_lock.c b/drivers/char/drm/drm_lock.c
index bea2a7d5b2b2..12dcdd1832f0 100644
--- a/drivers/char/drm/drm_lock.c
+++ b/drivers/char/drm/drm_lock.c
@@ -53,6 +53,7 @@ int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv)
53 DECLARE_WAITQUEUE(entry, current); 53 DECLARE_WAITQUEUE(entry, current);
54 struct drm_lock *lock = data; 54 struct drm_lock *lock = data;
55 int ret = 0; 55 int ret = 0;
56 unsigned long irqflags;
56 57
57 ++file_priv->lock_count; 58 ++file_priv->lock_count;
58 59
@@ -71,9 +72,9 @@ int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv)
71 return -EINVAL; 72 return -EINVAL;
72 73
73 add_wait_queue(&dev->lock.lock_queue, &entry); 74 add_wait_queue(&dev->lock.lock_queue, &entry);
74 spin_lock(&dev->lock.spinlock); 75 spin_lock_irqsave(&dev->lock.spinlock, irqflags);
75 dev->lock.user_waiters++; 76 dev->lock.user_waiters++;
76 spin_unlock(&dev->lock.spinlock); 77 spin_unlock_irqrestore(&dev->lock.spinlock, irqflags);
77 for (;;) { 78 for (;;) {
78 __set_current_state(TASK_INTERRUPTIBLE); 79 __set_current_state(TASK_INTERRUPTIBLE);
79 if (!dev->lock.hw_lock) { 80 if (!dev->lock.hw_lock) {
@@ -95,9 +96,9 @@ int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv)
95 break; 96 break;
96 } 97 }
97 } 98 }
98 spin_lock(&dev->lock.spinlock); 99 spin_lock_irqsave(&dev->lock.spinlock, irqflags);
99 dev->lock.user_waiters--; 100 dev->lock.user_waiters--;
100 spin_unlock(&dev->lock.spinlock); 101 spin_unlock_irqrestore(&dev->lock.spinlock, irqflags);
101 __set_current_state(TASK_RUNNING); 102 __set_current_state(TASK_RUNNING);
102 remove_wait_queue(&dev->lock.lock_queue, &entry); 103 remove_wait_queue(&dev->lock.lock_queue, &entry);
103 104
@@ -198,8 +199,9 @@ int drm_lock_take(struct drm_lock_data *lock_data,
198{ 199{
199 unsigned int old, new, prev; 200 unsigned int old, new, prev;
200 volatile unsigned int *lock = &lock_data->hw_lock->lock; 201 volatile unsigned int *lock = &lock_data->hw_lock->lock;
202 unsigned long irqflags;
201 203
202 spin_lock(&lock_data->spinlock); 204 spin_lock_irqsave(&lock_data->spinlock, irqflags);
203 do { 205 do {
204 old = *lock; 206 old = *lock;
205 if (old & _DRM_LOCK_HELD) 207 if (old & _DRM_LOCK_HELD)
@@ -211,7 +213,7 @@ int drm_lock_take(struct drm_lock_data *lock_data,
211 } 213 }
212 prev = cmpxchg(lock, old, new); 214 prev = cmpxchg(lock, old, new);
213 } while (prev != old); 215 } while (prev != old);
214 spin_unlock(&lock_data->spinlock); 216 spin_unlock_irqrestore(&lock_data->spinlock, irqflags);
215 217
216 if (_DRM_LOCKING_CONTEXT(old) == context) { 218 if (_DRM_LOCKING_CONTEXT(old) == context) {
217 if (old & _DRM_LOCK_HELD) { 219 if (old & _DRM_LOCK_HELD) {
@@ -272,15 +274,16 @@ int drm_lock_free(struct drm_lock_data *lock_data, unsigned int context)
272{ 274{
273 unsigned int old, new, prev; 275 unsigned int old, new, prev;
274 volatile unsigned int *lock = &lock_data->hw_lock->lock; 276 volatile unsigned int *lock = &lock_data->hw_lock->lock;
277 unsigned long irqflags;
275 278
276 spin_lock(&lock_data->spinlock); 279 spin_lock_irqsave(&lock_data->spinlock, irqflags);
277 if (lock_data->kernel_waiters != 0) { 280 if (lock_data->kernel_waiters != 0) {
278 drm_lock_transfer(lock_data, 0); 281 drm_lock_transfer(lock_data, 0);
279 lock_data->idle_has_lock = 1; 282 lock_data->idle_has_lock = 1;
280 spin_unlock(&lock_data->spinlock); 283 spin_unlock_irqrestore(&lock_data->spinlock, irqflags);
281 return 1; 284 return 1;
282 } 285 }
283 spin_unlock(&lock_data->spinlock); 286 spin_unlock_irqrestore(&lock_data->spinlock, irqflags);
284 287
285 do { 288 do {
286 old = *lock; 289 old = *lock;
@@ -344,19 +347,20 @@ static int drm_notifier(void *priv)
344void drm_idlelock_take(struct drm_lock_data *lock_data) 347void drm_idlelock_take(struct drm_lock_data *lock_data)
345{ 348{
346 int ret = 0; 349 int ret = 0;
350 unsigned long irqflags;
347 351
348 spin_lock(&lock_data->spinlock); 352 spin_lock_irqsave(&lock_data->spinlock, irqflags);
349 lock_data->kernel_waiters++; 353 lock_data->kernel_waiters++;
350 if (!lock_data->idle_has_lock) { 354 if (!lock_data->idle_has_lock) {
351 355
352 spin_unlock(&lock_data->spinlock); 356 spin_unlock_irqrestore(&lock_data->spinlock, irqflags);
353 ret = drm_lock_take(lock_data, DRM_KERNEL_CONTEXT); 357 ret = drm_lock_take(lock_data, DRM_KERNEL_CONTEXT);
354 spin_lock(&lock_data->spinlock); 358 spin_lock_irqsave(&lock_data->spinlock, irqflags);
355 359
356 if (ret == 1) 360 if (ret == 1)
357 lock_data->idle_has_lock = 1; 361 lock_data->idle_has_lock = 1;
358 } 362 }
359 spin_unlock(&lock_data->spinlock); 363 spin_unlock_irqrestore(&lock_data->spinlock, irqflags);
360} 364}
361EXPORT_SYMBOL(drm_idlelock_take); 365EXPORT_SYMBOL(drm_idlelock_take);
362 366
@@ -364,8 +368,9 @@ void drm_idlelock_release(struct drm_lock_data *lock_data)
364{ 368{
365 unsigned int old, prev; 369 unsigned int old, prev;
366 volatile unsigned int *lock = &lock_data->hw_lock->lock; 370 volatile unsigned int *lock = &lock_data->hw_lock->lock;
371 unsigned long irqflags;
367 372
368 spin_lock(&lock_data->spinlock); 373 spin_lock_irqsave(&lock_data->spinlock, irqflags);
369 if (--lock_data->kernel_waiters == 0) { 374 if (--lock_data->kernel_waiters == 0) {
370 if (lock_data->idle_has_lock) { 375 if (lock_data->idle_has_lock) {
371 do { 376 do {
@@ -376,7 +381,7 @@ void drm_idlelock_release(struct drm_lock_data *lock_data)
376 lock_data->idle_has_lock = 0; 381 lock_data->idle_has_lock = 0;
377 } 382 }
378 } 383 }
379 spin_unlock(&lock_data->spinlock); 384 spin_unlock_irqrestore(&lock_data->spinlock, irqflags);
380} 385}
381EXPORT_SYMBOL(drm_idlelock_release); 386EXPORT_SYMBOL(drm_idlelock_release);
382 387