aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chris Wilson <chris@chris-wilson.co.uk> 2018-02-05 04:41:39 -0500
committer Chris Wilson <chris@chris-wilson.co.uk> 2018-02-05 05:59:22 -0500
commit 302e55d7be959502058878e9edb1d369a73598d4 (patch)
tree 133028c3e09de8641c07748322b3371ae4df0a77
parent 55ef72f24fae5d41febe6b6ebf7304f9a2cb9471 (diff)
drm/i915: Report if an unbannable context is involved in a GPU hang
Since unbannable contexts are special and are not supposed to be causing GPU hangs in the first place, make it clear when they are implicated in said hang. In practice, most unbannable contexts are those created by igt for the express purpose of throwing untold thousands of hangs at the GPU, and which wish to keep doing so to finish the test. Normally they are cleaned up, but it's when they or the other unbannable kernel contexts stay stuck in an erroneous state that we need to worry, and so they need highlighting.

Suggested-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180205094139.10671-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
-rw-r--r-- drivers/gpu/drm/i915/i915_drv.h 1
-rw-r--r-- drivers/gpu/drm/i915/i915_gpu_error.c 21
2 files changed, 16 insertions, 6 deletions
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5ed220e28402..a241620f22ad 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -555,6 +555,7 @@ struct i915_gpu_state {
555 int ban_score; 555 int ban_score;
556 int active; 556 int active;
557 int guilty; 557 int guilty;
558 bool bannable;
558 } context; 559 } context;
559 560
560 struct drm_i915_error_object { 561 struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a81351d9e3a6..67c902412193 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -396,6 +396,11 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
396 ee->instdone.row[slice][subslice]); 396 ee->instdone.row[slice][subslice]);
397} 397}
398 398
399static const char *bannable(const struct drm_i915_error_context *ctx)
400{
401 return ctx->bannable ? "" : " (unbannable)";
402}
403
399static void error_print_request(struct drm_i915_error_state_buf *m, 404static void error_print_request(struct drm_i915_error_state_buf *m,
400 const char *prefix, 405 const char *prefix,
401 const struct drm_i915_error_request *erq) 406 const struct drm_i915_error_request *erq)
@@ -414,9 +419,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
414 const char *header, 419 const char *header,
415 const struct drm_i915_error_context *ctx) 420 const struct drm_i915_error_context *ctx)
416{ 421{
417 err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d guilty %d active %d\n", 422 err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
418 header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, 423 header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
419 ctx->priority, ctx->ban_score, ctx->guilty, ctx->active); 424 ctx->priority, ctx->ban_score, bannable(ctx),
425 ctx->guilty, ctx->active);
420} 426}
421 427
422static void error_print_engine(struct drm_i915_error_state_buf *m, 428static void error_print_engine(struct drm_i915_error_state_buf *m,
@@ -644,11 +650,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
644 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 650 for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
645 if (error->engine[i].hangcheck_stalled && 651 if (error->engine[i].hangcheck_stalled &&
646 error->engine[i].context.pid) { 652 error->engine[i].context.pid) {
647 err_printf(m, "Active process (on ring %s): %s [%d], score %d\n", 653 err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
648 engine_name(m->i915, i), 654 engine_name(m->i915, i),
649 error->engine[i].context.comm, 655 error->engine[i].context.comm,
650 error->engine[i].context.pid, 656 error->engine[i].context.pid,
651 error->engine[i].context.ban_score); 657 error->engine[i].context.ban_score,
658 bannable(&error->engine[i].context));
652 } 659 }
653 } 660 }
654 err_printf(m, "Reset count: %u\n", error->reset_count); 661 err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -736,12 +743,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
736 if (obj) { 743 if (obj) {
737 err_puts(m, dev_priv->engine[i]->name); 744 err_puts(m, dev_priv->engine[i]->name);
738 if (ee->context.pid) 745 if (ee->context.pid)
739 err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)", 746 err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
740 ee->context.comm, 747 ee->context.comm,
741 ee->context.pid, 748 ee->context.pid,
742 ee->context.handle, 749 ee->context.handle,
743 ee->context.hw_id, 750 ee->context.hw_id,
744 ee->context.ban_score); 751 ee->context.ban_score,
752 bannable(&ee->context));
745 err_printf(m, " --- gtt_offset = 0x%08x %08x\n", 753 err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
746 upper_32_bits(obj->gtt_offset), 754 upper_32_bits(obj->gtt_offset),
747 lower_32_bits(obj->gtt_offset)); 755 lower_32_bits(obj->gtt_offset));
@@ -1383,6 +1391,7 @@ static void record_context(struct drm_i915_error_context *e,
1383 e->hw_id = ctx->hw_id; 1391 e->hw_id = ctx->hw_id;
1384 e->priority = ctx->priority; 1392 e->priority = ctx->priority;
1385 e->ban_score = atomic_read(&ctx->ban_score); 1393 e->ban_score = atomic_read(&ctx->ban_score);
1394 e->bannable = i915_gem_context_is_bannable(ctx);
1386 e->guilty = atomic_read(&ctx->guilty_count); 1395 e->guilty = atomic_read(&ctx->guilty_count);
1387 e->active = atomic_read(&ctx->active_count); 1396 e->active = atomic_read(&ctx->active_count);
1388} 1397}