about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/gpu/drm/i915/i915_drv.c 13
-rw-r--r-- drivers/gpu/drm/i915/i915_drv.h 10
-rw-r--r-- drivers/gpu/drm/i915/i915_gem.c 5
-rw-r--r-- drivers/gpu/drm/i915/i915_gpu_error.h 3
-rw-r--r-- drivers/gpu/drm/i915/i915_irq.c 12
-rw-r--r-- drivers/gpu/drm/i915/i915_request.c 6
-rw-r--r-- drivers/gpu/drm/i915/selftests/intel_hangcheck.c 30
7 files changed, 47 insertions, 32 deletions
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 7ce229c6f424..f770be18b2d7 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1866,6 +1866,8 @@ static int i915_resume_switcheroo(struct drm_device *dev)
1866/** 1866/**
1867 * i915_reset - reset chip after a hang 1867 * i915_reset - reset chip after a hang
1868 * @i915: #drm_i915_private to reset 1868 * @i915: #drm_i915_private to reset
1869 * @stalled_mask: mask of the stalled engines with the guilty requests
1870 * @reason: user error message for why we are resetting
1869 * 1871 *
1870 * Reset the chip. Useful if a hang is detected. Marks the device as wedged 1872 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
1871 * on failure. 1873 * on failure.
@@ -1880,7 +1882,9 @@ static int i915_resume_switcheroo(struct drm_device *dev)
1880 * - re-init interrupt state 1882 * - re-init interrupt state
1881 * - re-init display 1883 * - re-init display
1882 */ 1884 */
1883void i915_reset(struct drm_i915_private *i915) 1885void i915_reset(struct drm_i915_private *i915,
1886 unsigned int stalled_mask,
1887 const char *reason)
1884{ 1888{
1885 struct i915_gpu_error *error = &i915->gpu_error; 1889 struct i915_gpu_error *error = &i915->gpu_error;
1886 int ret; 1890 int ret;
@@ -1899,9 +1903,8 @@ void i915_reset(struct drm_i915_private *i915)
1899 if (!i915_gem_unset_wedged(i915)) 1903 if (!i915_gem_unset_wedged(i915))
1900 goto wakeup; 1904 goto wakeup;
1901 1905
1902 if (error->reason) 1906 if (reason)
1903 dev_notice(i915->drm.dev, 1907 dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
1904 "Resetting chip for %s\n", error->reason);
1905 error->reset_count++; 1908 error->reset_count++;
1906 1909
1907 disable_irq(i915->drm.irq); 1910 disable_irq(i915->drm.irq);
@@ -1944,7 +1947,7 @@ void i915_reset(struct drm_i915_private *i915)
1944 goto error; 1947 goto error;
1945 } 1948 }
1946 1949
1947 i915_gem_reset(i915); 1950 i915_gem_reset(i915, stalled_mask);
1948 intel_overlay_reset(i915); 1951 intel_overlay_reset(i915);
1949 1952
1950 /* 1953 /*
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6b3f2f651def..9bca104c409e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2701,8 +2701,11 @@ extern void i915_driver_unload(struct drm_device *dev);
2701extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); 2701extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
2702extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); 2702extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
2703 2703
2704extern void i915_reset(struct drm_i915_private *i915); 2704extern void i915_reset(struct drm_i915_private *i915,
2705extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg); 2705 unsigned int stalled_mask,
2706 const char *reason);
2707extern int i915_reset_engine(struct intel_engine_cs *engine,
2708 const char *reason);
2706 2709
2707extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv); 2710extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
2708extern int intel_reset_guc(struct drm_i915_private *dev_priv); 2711extern int intel_reset_guc(struct drm_i915_private *dev_priv);
@@ -3126,7 +3129,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
3126struct i915_request * 3129struct i915_request *
3127i915_gem_reset_prepare_engine(struct intel_engine_cs *engine); 3130i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
3128int i915_gem_reset_prepare(struct drm_i915_private *dev_priv); 3131int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
3129void i915_gem_reset(struct drm_i915_private *dev_priv); 3132void i915_gem_reset(struct drm_i915_private *dev_priv,
3133 unsigned int stalled_mask);
3130void i915_gem_reset_finish_engine(struct intel_engine_cs *engine); 3134void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
3131void i915_gem_reset_finish(struct drm_i915_private *dev_priv); 3135void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
3132void i915_gem_set_wedged(struct drm_i915_private *dev_priv); 3136void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 306d7a805eb7..28ab0beff86c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3213,7 +3213,8 @@ void i915_gem_reset_engine(struct intel_engine_cs *engine,
3213 engine->reset_hw(engine, request); 3213 engine->reset_hw(engine, request);
3214} 3214}
3215 3215
3216void i915_gem_reset(struct drm_i915_private *dev_priv) 3216void i915_gem_reset(struct drm_i915_private *dev_priv,
3217 unsigned int stalled_mask)
3217{ 3218{
3218 struct intel_engine_cs *engine; 3219 struct intel_engine_cs *engine;
3219 enum intel_engine_id id; 3220 enum intel_engine_id id;
@@ -3227,7 +3228,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
3227 3228
3228 i915_gem_reset_engine(engine, 3229 i915_gem_reset_engine(engine,
3229 engine->hangcheck.active_request, 3230 engine->hangcheck.active_request,
3230 engine->hangcheck.stalled); 3231 stalled_mask & ENGINE_MASK(id));
3231 ctx = fetch_and_zero(&engine->last_retired_context); 3232 ctx = fetch_and_zero(&engine->last_retired_context);
3232 if (ctx) 3233 if (ctx)
3233 engine->context_unpin(engine, ctx); 3234 engine->context_unpin(engine, ctx);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index ac5760673cc9..c05b6034d718 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -269,6 +269,9 @@ struct i915_gpu_error {
269 /** Number of times an engine has been reset */ 269 /** Number of times an engine has been reset */
270 u32 reset_engine_count[I915_NUM_ENGINES]; 270 u32 reset_engine_count[I915_NUM_ENGINES];
271 271
272 /** Set of stalled engines with guilty requests, in the current reset */
273 u32 stalled_mask;
274
272 /** Reason for the current *global* reset */ 275 /** Reason for the current *global* reset */
273 const char *reason; 276 const char *reason;
274 277
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index c2f878ace0ea..b03d18561b55 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2961,7 +2961,8 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
2961} 2961}
2962 2962
2963static void i915_reset_device(struct drm_i915_private *dev_priv, 2963static void i915_reset_device(struct drm_i915_private *dev_priv,
2964 const char *msg) 2964 u32 engine_mask,
2965 const char *reason)
2965{ 2966{
2966 struct i915_gpu_error *error = &dev_priv->gpu_error; 2967 struct i915_gpu_error *error = &dev_priv->gpu_error;
2967 struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj; 2968 struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
@@ -2979,9 +2980,11 @@ static void i915_reset_device(struct drm_i915_private *dev_priv,
2979 i915_wedge_on_timeout(&w, dev_priv, 5*HZ) { 2980 i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
2980 intel_prepare_reset(dev_priv); 2981 intel_prepare_reset(dev_priv);
2981 2982
2982 error->reason = msg; 2983 error->reason = reason;
2984 error->stalled_mask = engine_mask;
2983 2985
2984 /* Signal that locked waiters should reset the GPU */ 2986 /* Signal that locked waiters should reset the GPU */
2987 smp_mb__before_atomic();
2985 set_bit(I915_RESET_HANDOFF, &error->flags); 2988 set_bit(I915_RESET_HANDOFF, &error->flags);
2986 wake_up_all(&error->wait_queue); 2989 wake_up_all(&error->wait_queue);
2987 2990
@@ -2990,7 +2993,7 @@ static void i915_reset_device(struct drm_i915_private *dev_priv,
2990 */ 2993 */
2991 do { 2994 do {
2992 if (mutex_trylock(&dev_priv->drm.struct_mutex)) { 2995 if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
2993 i915_reset(dev_priv); 2996 i915_reset(dev_priv, engine_mask, reason);
2994 mutex_unlock(&dev_priv->drm.struct_mutex); 2997 mutex_unlock(&dev_priv->drm.struct_mutex);
2995 } 2998 }
2996 } while (wait_on_bit_timeout(&error->flags, 2999 } while (wait_on_bit_timeout(&error->flags,
@@ -2998,6 +3001,7 @@ static void i915_reset_device(struct drm_i915_private *dev_priv,
2998 TASK_UNINTERRUPTIBLE, 3001 TASK_UNINTERRUPTIBLE,
2999 1)); 3002 1));
3000 3003
3004 error->stalled_mask = 0;
3001 error->reason = NULL; 3005 error->reason = NULL;
3002 3006
3003 intel_finish_reset(dev_priv); 3007 intel_finish_reset(dev_priv);
@@ -3122,7 +3126,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
3122 TASK_UNINTERRUPTIBLE); 3126 TASK_UNINTERRUPTIBLE);
3123 } 3127 }
3124 3128
3125 i915_reset_device(dev_priv, msg); 3129 i915_reset_device(dev_priv, engine_mask, msg);
3126 3130
3127 for_each_engine(engine, dev_priv, tmp) { 3131 for_each_engine(engine, dev_priv, tmp) {
3128 clear_bit(I915_RESET_ENGINE + engine->id, 3132 clear_bit(I915_RESET_ENGINE + engine->id,
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index a9d0bde16443..629f3e860592 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1185,11 +1185,13 @@ static bool __i915_spin_request(const struct i915_request *rq,
1185 1185
1186static bool __i915_wait_request_check_and_reset(struct i915_request *request) 1186static bool __i915_wait_request_check_and_reset(struct i915_request *request)
1187{ 1187{
1188 if (likely(!i915_reset_handoff(&request->i915->gpu_error))) 1188 struct i915_gpu_error *error = &request->i915->gpu_error;
1189
1190 if (likely(!i915_reset_handoff(error)))
1189 return false; 1191 return false;
1190 1192
1191 __set_current_state(TASK_RUNNING); 1193 __set_current_state(TASK_RUNNING);
1192 i915_reset(request->i915); 1194 i915_reset(request->i915, error->stalled_mask, error->reason);
1193 return true; 1195 return true;
1194} 1196}
1195 1197
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index acfb4dcc9fb5..24f913f26a7b 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -437,7 +437,7 @@ static int igt_global_reset(void *arg)
437 mutex_lock(&i915->drm.struct_mutex); 437 mutex_lock(&i915->drm.struct_mutex);
438 reset_count = i915_reset_count(&i915->gpu_error); 438 reset_count = i915_reset_count(&i915->gpu_error);
439 439
440 i915_reset(i915); 440 i915_reset(i915, ALL_ENGINES, NULL);
441 441
442 if (i915_reset_count(&i915->gpu_error) == reset_count) { 442 if (i915_reset_count(&i915->gpu_error) == reset_count) {
443 pr_err("No GPU reset recorded!\n"); 443 pr_err("No GPU reset recorded!\n");
@@ -881,17 +881,18 @@ static int igt_reset_engines(void *arg)
881 return 0; 881 return 0;
882} 882}
883 883
884static u32 fake_hangcheck(struct i915_request *rq) 884static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
885{ 885{
886 u32 reset_count; 886 struct i915_gpu_error *error = &rq->i915->gpu_error;
887 u32 reset_count = i915_reset_count(error);
887 888
888 rq->engine->hangcheck.stalled = true; 889 error->stalled_mask = mask;
889 rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);
890 890
891 reset_count = i915_reset_count(&rq->i915->gpu_error); 891 /* set_bit() must be after we have setup the backchannel (mask) */
892 smp_mb__before_atomic();
893 set_bit(I915_RESET_HANDOFF, &error->flags);
892 894
893 set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags); 895 wake_up_all(&error->wait_queue);
894 wake_up_all(&rq->i915->gpu_error.wait_queue);
895 896
896 return reset_count; 897 return reset_count;
897} 898}
@@ -939,7 +940,7 @@ static int igt_wait_reset(void *arg)
939 goto out_rq; 940 goto out_rq;
940 } 941 }
941 942
942 reset_count = fake_hangcheck(rq); 943 reset_count = fake_hangcheck(rq, ALL_ENGINES);
943 944
944 timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10); 945 timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
945 if (timeout < 0) { 946 if (timeout < 0) {
@@ -1075,9 +1076,9 @@ static int igt_reset_queue(void *arg)
1075 goto fini; 1076 goto fini;
1076 } 1077 }
1077 1078
1078 reset_count = fake_hangcheck(prev); 1079 reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
1079 1080
1080 i915_reset(i915); 1081 i915_reset(i915, ENGINE_MASK(id), NULL);
1081 1082
1082 GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, 1083 GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
1083 &i915->gpu_error.flags)); 1084 &i915->gpu_error.flags));
@@ -1150,7 +1151,7 @@ static int igt_handle_error(void *arg)
1150 if (!intel_has_reset_engine(i915)) 1151 if (!intel_has_reset_engine(i915))
1151 return 0; 1152 return 0;
1152 1153
1153 if (!intel_engine_can_store_dword(i915->engine[RCS])) 1154 if (!engine || !intel_engine_can_store_dword(engine))
1154 return 0; 1155 return 0;
1155 1156
1156 mutex_lock(&i915->drm.struct_mutex); 1157 mutex_lock(&i915->drm.struct_mutex);
@@ -1186,10 +1187,7 @@ static int igt_handle_error(void *arg)
1186 /* Temporarily disable error capture */ 1187 /* Temporarily disable error capture */
1187 error = xchg(&i915->gpu_error.first_error, (void *)-1); 1188 error = xchg(&i915->gpu_error.first_error, (void *)-1);
1188 1189
1189 engine->hangcheck.stalled = true; 1190 i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);
1190 engine->hangcheck.seqno = intel_engine_get_seqno(engine);
1191
1192 i915_handle_error(i915, intel_engine_flag(engine), 0, NULL);
1193 1191
1194 xchg(&i915->gpu_error.first_error, error); 1192 xchg(&i915->gpu_error.first_error, error);
1195 1193