diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2014-02-25 10:11:24 -0500 |
---|---|---|
committer | Daniel Vetter <daniel.vetter@ffwll.ch> | 2014-03-05 15:30:24 -0500 |
commit | ab0e7ff9f2d0bfe139a2ed5bb6a36f8cbd4e0886 (patch) | |
tree | 66a0a24af927cffbb34a25d0c0190d1a4507cecc /drivers/gpu/drm/i915 | |
parent | 8d9fc7fd2de6edc3b9c3f828a701bfa6891987e7 (diff) |
drm/i915: Record pid/comm of hanging task
After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.
This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.
v2: Also capture the workaround buffer for gen2, so that we can compare
its contents against the intended batch for the active request.
v3: Rebase (Mika)
v4: Check for null context (Chris)
checkpatch warnings fixed
Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v2)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v4)
Acked-by: Ben Widawsky <ben@bwidawsk.net>
Cc: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'drivers/gpu/drm/i915')
-rw-r--r-- | drivers/gpu/drm/i915/i915_drv.h | 6 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_gem.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.c | 136 |
3 files changed, 86 insertions, 57 deletions
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fe4427be2e03..826fcaef25c1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -360,7 +360,7 @@ struct drm_i915_error_state { | |||
360 | int page_count; | 360 | int page_count; |
361 | u32 gtt_offset; | 361 | u32 gtt_offset; |
362 | u32 *pages[0]; | 362 | u32 *pages[0]; |
363 | } *ringbuffer, *batchbuffer, *ctx, *hws_page; | 363 | } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page; |
364 | 364 | ||
365 | struct drm_i915_error_request { | 365 | struct drm_i915_error_request { |
366 | long jiffies; | 366 | long jiffies; |
@@ -375,6 +375,9 @@ struct drm_i915_error_state { | |||
375 | u32 pp_dir_base; | 375 | u32 pp_dir_base; |
376 | }; | 376 | }; |
377 | } vm_info; | 377 | } vm_info; |
378 | |||
379 | pid_t pid; | ||
380 | char comm[TASK_COMM_LEN]; | ||
378 | } ring[I915_NUM_RINGS]; | 381 | } ring[I915_NUM_RINGS]; |
379 | struct drm_i915_error_buffer { | 382 | struct drm_i915_error_buffer { |
380 | u32 size; | 383 | u32 size; |
@@ -1797,6 +1800,7 @@ struct drm_i915_gem_request { | |||
1797 | 1800 | ||
1798 | struct drm_i915_file_private { | 1801 | struct drm_i915_file_private { |
1799 | struct drm_i915_private *dev_priv; | 1802 | struct drm_i915_private *dev_priv; |
1803 | struct drm_file *file; | ||
1800 | 1804 | ||
1801 | struct { | 1805 | struct { |
1802 | spinlock_t lock; | 1806 | spinlock_t lock; |
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c5a182be2eb0..6e17b45db850 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4857,6 +4857,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file) | |||
4857 | 4857 | ||
4858 | file->driver_priv = file_priv; | 4858 | file->driver_priv = file_priv; |
4859 | file_priv->dev_priv = dev->dev_private; | 4859 | file_priv->dev_priv = dev->dev_private; |
4860 | file_priv->file = file; | ||
4860 | 4861 | ||
4861 | spin_lock_init(&file_priv->mm.lock); | 4862 | spin_lock_init(&file_priv->mm.lock); |
4862 | INIT_LIST_HEAD(&file_priv->mm.request_list); | 4863 | INIT_LIST_HEAD(&file_priv->mm.request_list); |
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index eed1b34eaf47..8b02498ee963 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -301,13 +301,28 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) | |||
301 | va_end(args); | 301 | va_end(args); |
302 | } | 302 | } |
303 | 303 | ||
304 | static void print_error_obj(struct drm_i915_error_state_buf *m, | ||
305 | struct drm_i915_error_object *obj) | ||
306 | { | ||
307 | int page, offset, elt; | ||
308 | |||
309 | for (page = offset = 0; page < obj->page_count; page++) { | ||
310 | for (elt = 0; elt < PAGE_SIZE/4; elt++) { | ||
311 | err_printf(m, "%08x : %08x\n", offset, | ||
312 | obj->pages[page][elt]); | ||
313 | offset += 4; | ||
314 | } | ||
315 | } | ||
316 | } | ||
317 | |||
304 | int i915_error_state_to_str(struct drm_i915_error_state_buf *m, | 318 | int i915_error_state_to_str(struct drm_i915_error_state_buf *m, |
305 | const struct i915_error_state_file_priv *error_priv) | 319 | const struct i915_error_state_file_priv *error_priv) |
306 | { | 320 | { |
307 | struct drm_device *dev = error_priv->dev; | 321 | struct drm_device *dev = error_priv->dev; |
308 | drm_i915_private_t *dev_priv = dev->dev_private; | 322 | drm_i915_private_t *dev_priv = dev->dev_private; |
309 | struct drm_i915_error_state *error = error_priv->error; | 323 | struct drm_i915_error_state *error = error_priv->error; |
310 | int i, j, page, offset, elt; | 324 | int i, j, offset, elt; |
325 | int max_hangcheck_score; | ||
311 | 326 | ||
312 | if (!error) { | 327 | if (!error) { |
313 | err_printf(m, "no error state collected\n"); | 328 | err_printf(m, "no error state collected\n"); |
@@ -317,6 +332,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, | |||
317 | err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec, | 332 | err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec, |
318 | error->time.tv_usec); | 333 | error->time.tv_usec); |
319 | err_printf(m, "Kernel: " UTS_RELEASE "\n"); | 334 | err_printf(m, "Kernel: " UTS_RELEASE "\n"); |
335 | max_hangcheck_score = 0; | ||
336 | for (i = 0; i < ARRAY_SIZE(error->ring); i++) { | ||
337 | if (error->ring[i].hangcheck_score > max_hangcheck_score) | ||
338 | max_hangcheck_score = error->ring[i].hangcheck_score; | ||
339 | } | ||
340 | for (i = 0; i < ARRAY_SIZE(error->ring); i++) { | ||
341 | if (error->ring[i].hangcheck_score == max_hangcheck_score && | ||
342 | error->ring[i].pid != -1) { | ||
343 | err_printf(m, "Active process (on ring %s): %s [%d]\n", | ||
344 | ring_str(i), | ||
345 | error->ring[i].comm, | ||
346 | error->ring[i].pid); | ||
347 | } | ||
348 | } | ||
320 | err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device); | 349 | err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device); |
321 | err_printf(m, "EIR: 0x%08x\n", error->eir); | 350 | err_printf(m, "EIR: 0x%08x\n", error->eir); |
322 | err_printf(m, "IER: 0x%08x\n", error->ier); | 351 | err_printf(m, "IER: 0x%08x\n", error->ier); |
@@ -359,18 +388,23 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, | |||
359 | for (i = 0; i < ARRAY_SIZE(error->ring); i++) { | 388 | for (i = 0; i < ARRAY_SIZE(error->ring); i++) { |
360 | struct drm_i915_error_object *obj; | 389 | struct drm_i915_error_object *obj; |
361 | 390 | ||
362 | if ((obj = error->ring[i].batchbuffer)) { | 391 | obj = error->ring[i].batchbuffer; |
363 | err_printf(m, "%s --- gtt_offset = 0x%08x\n", | 392 | if (obj) { |
364 | dev_priv->ring[i].name, | 393 | err_puts(m, dev_priv->ring[i].name); |
394 | if (error->ring[i].pid != -1) | ||
395 | err_printf(m, " (submitted by %s [%d])", | ||
396 | error->ring[i].comm, | ||
397 | error->ring[i].pid); | ||
398 | err_printf(m, " --- gtt_offset = 0x%08x\n", | ||
365 | obj->gtt_offset); | 399 | obj->gtt_offset); |
366 | offset = 0; | 400 | print_error_obj(m, obj); |
367 | for (page = 0; page < obj->page_count; page++) { | 401 | } |
368 | for (elt = 0; elt < PAGE_SIZE/4; elt++) { | 402 | |
369 | err_printf(m, "%08x : %08x\n", offset, | 403 | obj = error->ring[i].wa_batchbuffer; |
370 | obj->pages[page][elt]); | 404 | if (obj) { |
371 | offset += 4; | 405 | err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n", |
372 | } | 406 | dev_priv->ring[i].name, obj->gtt_offset); |
373 | } | 407 | print_error_obj(m, obj); |
374 | } | 408 | } |
375 | 409 | ||
376 | if (error->ring[i].num_requests) { | 410 | if (error->ring[i].num_requests) { |
@@ -389,15 +423,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, | |||
389 | err_printf(m, "%s --- ringbuffer = 0x%08x\n", | 423 | err_printf(m, "%s --- ringbuffer = 0x%08x\n", |
390 | dev_priv->ring[i].name, | 424 | dev_priv->ring[i].name, |
391 | obj->gtt_offset); | 425 | obj->gtt_offset); |
392 | offset = 0; | 426 | print_error_obj(m, obj); |
393 | for (page = 0; page < obj->page_count; page++) { | ||
394 | for (elt = 0; elt < PAGE_SIZE/4; elt++) { | ||
395 | err_printf(m, "%08x : %08x\n", | ||
396 | offset, | ||
397 | obj->pages[page][elt]); | ||
398 | offset += 4; | ||
399 | } | ||
400 | } | ||
401 | } | 427 | } |
402 | 428 | ||
403 | if ((obj = error->ring[i].hws_page)) { | 429 | if ((obj = error->ring[i].hws_page)) { |
@@ -713,39 +739,6 @@ static void i915_gem_record_fences(struct drm_device *dev, | |||
713 | } | 739 | } |
714 | } | 740 | } |
715 | 741 | ||
716 | static struct drm_i915_error_object * | ||
717 | i915_error_first_batchbuffer(struct drm_i915_private *dev_priv, | ||
718 | struct intel_ring_buffer *ring) | ||
719 | { | ||
720 | struct drm_i915_gem_request *request; | ||
721 | |||
722 | if (HAS_BROKEN_CS_TLB(dev_priv->dev)) { | ||
723 | struct drm_i915_gem_object *obj; | ||
724 | u32 acthd = I915_READ(ACTHD); | ||
725 | |||
726 | if (WARN_ON(ring->id != RCS)) | ||
727 | return NULL; | ||
728 | |||
729 | obj = ring->scratch.obj; | ||
730 | if (obj != NULL && | ||
731 | acthd >= i915_gem_obj_ggtt_offset(obj) && | ||
732 | acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size) | ||
733 | return i915_error_ggtt_object_create(dev_priv, obj); | ||
734 | } | ||
735 | |||
736 | request = i915_gem_find_active_request(ring); | ||
737 | if (request == NULL) | ||
738 | return NULL; | ||
739 | |||
740 | /* We need to copy these to an anonymous buffer as the simplest | ||
741 | * method to avoid being overwritten by userspace. | ||
742 | */ | ||
743 | return i915_error_object_create(dev_priv, request->batch_obj, | ||
744 | request->ctx ? | ||
745 | request->ctx->vm : | ||
746 | &dev_priv->gtt.base); | ||
747 | } | ||
748 | |||
749 | static void i915_record_ring_state(struct drm_device *dev, | 742 | static void i915_record_ring_state(struct drm_device *dev, |
750 | struct intel_ring_buffer *ring, | 743 | struct intel_ring_buffer *ring, |
751 | struct drm_i915_error_ring *ering) | 744 | struct drm_i915_error_ring *ering) |
@@ -894,8 +887,39 @@ static void i915_gem_record_rings(struct drm_device *dev, | |||
894 | 887 | ||
895 | i915_record_ring_state(dev, ring, &error->ring[i]); | 888 | i915_record_ring_state(dev, ring, &error->ring[i]); |
896 | 889 | ||
897 | error->ring[i].batchbuffer = | 890 | error->ring[i].pid = -1; |
898 | i915_error_first_batchbuffer(dev_priv, ring); | 891 | request = i915_gem_find_active_request(ring); |
892 | if (request) { | ||
893 | /* We need to copy these to an anonymous buffer | ||
894 | * as the simplest method to avoid being overwritten | ||
895 | * by userspace. | ||
896 | */ | ||
897 | error->ring[i].batchbuffer = | ||
898 | i915_error_object_create(dev_priv, | ||
899 | request->batch_obj, | ||
900 | request->ctx ? | ||
901 | request->ctx->vm : | ||
902 | &dev_priv->gtt.base); | ||
903 | |||
904 | if (HAS_BROKEN_CS_TLB(dev_priv->dev) && | ||
905 | ring->scratch.obj) | ||
906 | error->ring[i].wa_batchbuffer = | ||
907 | i915_error_ggtt_object_create(dev_priv, | ||
908 | ring->scratch.obj); | ||
909 | |||
910 | if (request->file_priv) { | ||
911 | struct task_struct *task; | ||
912 | |||
913 | rcu_read_lock(); | ||
914 | task = pid_task(request->file_priv->file->pid, | ||
915 | PIDTYPE_PID); | ||
916 | if (task) { | ||
917 | strcpy(error->ring[i].comm, task->comm); | ||
918 | error->ring[i].pid = task->pid; | ||
919 | } | ||
920 | rcu_read_unlock(); | ||
921 | } | ||
922 | } | ||
899 | 923 | ||
900 | error->ring[i].ringbuffer = | 924 | error->ring[i].ringbuffer = |
901 | i915_error_ggtt_object_create(dev_priv, ring->obj); | 925 | i915_error_ggtt_object_create(dev_priv, ring->obj); |