aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBen Widawsky <benjamin.widawsky@intel.com>2014-02-04 07:18:55 -0500
committerDaniel Vetter <daniel.vetter@ffwll.ch>2014-02-05 11:17:10 -0500
commit011cf577b2531dfbd2254bd9ec147ad71471abaf (patch)
tree0ec93e2cd7eb2e95e5bb8e3c1e6c4d86beca95a4
parent579a9b0e72e954d6bebcd193460ffb2ebac8e4fe (diff)
drm/i915: Generate a hang error code
We get a large number of bug reports that attract "hey, I have that too" comments because the commenter also sees a GPU hang in dmesg. While two machines of the same model both having a GPU hang is indeed a coincidence, it is far from enough evidence to suggest the hangs are the same. To reduce this effect, and hopefully get people to file new bug reports, generate a semi-unique error code and print it with the hang message; clearly the error message by itself has been insufficient (see the reference at the bottom for a new bug report with this characteristic).

The algorithm is purposely pretty naive. I don't think we need much in order to avoid the problem I am trying to solve, and keeping it naive gives us some ability to make a decent test case.

Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
References: https://bugs.freedesktop.org/show_bug.cgi?id=73276
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-rw-r--r--drivers/gpu/drm/i915/i915_gpu_error.c44
1 files changed, 37 insertions, 7 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 94542d498296..dc47bb9742d2 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -653,6 +653,33 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
653 return i; 653 return i;
654} 654}
655 655
656/* Generate a semi-unique error code. The code is not meant to have meaning; the
657 * code's only purpose is to try to prevent false duplicate bug reports by
658 * grossly estimating a GPU error state.
659 *
660 * TODO: Ideally, hashing the batchbuffer would be a very nice way to identify
661 * the hang, if we could strip the GTT offset information from it.
662 *
663 * In its current form it is only a small step better than a random number.
664 */
665static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
666 struct drm_i915_error_state *error)
667{
668 uint32_t error_code = 0;
669 int i;
670
671 /* IPEHR would be an ideal way to detect errors, as it's the gross
672 * measure of "the command that hung." However, some very common
673 * synchronization commands almost always appear there when the hang is
674 * strictly a client bug. XOR in instdone to tell those cases apart.
675 */
676 for (i = 0; i < I915_NUM_RINGS; i++)
677 if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
678 return error->ring[i].ipehr ^ error->ring[i].instdone; /* first hung ring decides the code */
679
680 return error_code; /* no ring marked hung: error_code is still its initial 0 */
681}
682
656static void i915_gem_record_fences(struct drm_device *dev, 683static void i915_gem_record_fences(struct drm_device *dev,
657 struct drm_i915_error_state *error) 684 struct drm_i915_error_state *error)
658{ 685{
@@ -1098,6 +1125,7 @@ void i915_capture_error_state(struct drm_device *dev)
1098 struct drm_i915_private *dev_priv = dev->dev_private; 1125 struct drm_i915_private *dev_priv = dev->dev_private;
1099 struct drm_i915_error_state *error; 1126 struct drm_i915_error_state *error;
1100 unsigned long flags; 1127 unsigned long flags;
1128 uint32_t ecode;
1101 1129
1102 spin_lock_irqsave(&dev_priv->gpu_error.lock, flags); 1130 spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
1103 error = dev_priv->gpu_error.first_error; 1131 error = dev_priv->gpu_error.first_error;
@@ -1114,7 +1142,16 @@ void i915_capture_error_state(struct drm_device *dev)
1114 1142
1115 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", 1143 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
1116 dev->primary->index); 1144 dev->primary->index);
1145 kref_init(&error->ref);
1146
1147 i915_capture_reg_state(dev_priv, error);
1148 i915_gem_capture_buffers(dev_priv, error);
1149 i915_gem_record_fences(dev, error);
1150 i915_gem_record_rings(dev, error);
1151 ecode = i915_error_generate_code(dev_priv, error);
1152
1117 if (!warned) { 1153 if (!warned) {
1154 DRM_INFO("GPU HANG [%x]\n", ecode);
1118 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); 1155 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
1119 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); 1156 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
1120 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); 1157 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
@@ -1122,13 +1159,6 @@ void i915_capture_error_state(struct drm_device *dev)
1122 warned = true; 1159 warned = true;
1123 } 1160 }
1124 1161
1125 kref_init(&error->ref);
1126
1127 i915_capture_reg_state(dev_priv, error);
1128 i915_gem_capture_buffers(dev_priv, error);
1129 i915_gem_record_fences(dev, error);
1130 i915_gem_record_rings(dev, error);
1131
1132 do_gettimeofday(&error->time); 1162 do_gettimeofday(&error->time);
1133 1163
1134 error->overlay = intel_overlay_capture_error_state(dev); 1164 error->overlay = intel_overlay_capture_error_state(dev);