aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2013-09-25 12:34:55 -0400
committerDaniel Vetter <daniel.vetter@ffwll.ch>2013-10-03 14:01:30 -0400
commit094f9a54e35500739da185cdb78f2e92fc379458 (patch)
tree52487590c65d652f6915308746eacad8fb1aceec
parentcbb47d179fb345c579cd8cd884693903fceed26a (diff)
drm/i915: Fix __wait_seqno to use true infinite timeouts
When we switched to always using a timeout in conjunction with wait_seqno, we lost the ability to detect missed interrupts. Since we have had issues with interrupts on a number of generations, and they are required to be delivered in a timely fashion for a smooth UX, it is important that we do log errors found in the wild and prevent the display stalling for upwards of 1s every time the seqno interrupt is missed. Rather than continue to fix up the timeouts to work around the interface impedance in wait_event_*(), open code the combination of wait_event[_interruptible][_timeout], and use the exposed timer to poll for seqno should we detect a lost interrupt. v2: In order to satisfy the debug requirement of logging missed interrupts with the real world requirements of making machines work even if interrupts are hosed, we revert to polling after detecting a missed interrupt. v3: Throw in a debugfs interface to simulate broken hw not reporting interrupts. v4: s/EGAIN/EAGAIN/ (Imre) Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Imre Deak <imre.deak@intel.com> [danvet: Don't use the struct typedef in new code.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c68
-rw-r--r--drivers/gpu/drm/i915/i915_drv.h6
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c114
-rw-r--r--drivers/gpu/drm/i915/i915_gpu_error.c1
-rw-r--r--drivers/gpu/drm/i915/i915_irq.c11
5 files changed, 149 insertions, 51 deletions
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fcfa98844ccc..bc5c04d5890f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1897,6 +1897,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
1897 i915_ring_stop_get, i915_ring_stop_set, 1897 i915_ring_stop_get, i915_ring_stop_set,
1898 "0x%08llx\n"); 1898 "0x%08llx\n");
1899 1899
/*
 * debugfs getter: report the current missed-interrupt ring mask.
 *
 * @data: opaque pointer, used here as the struct drm_device.
 * @val:  out-parameter receiving gpu_error.missed_irq_rings.
 *
 * Always returns 0. The mask is read without taking any lock.
 */
1900static int
1901i915_ring_missed_irq_get(void *data, u64 *val)
1902{
1903 struct drm_device *dev = data;
1904 struct drm_i915_private *dev_priv = dev->dev_private;
1905
1906 *val = dev_priv->gpu_error.missed_irq_rings;
1907 return 0;
1908}
1909
/*
 * debugfs setter: overwrite the missed-interrupt ring mask with @val.
 *
 * @data: opaque pointer, used here as the struct drm_device.
 * @val:  new value for gpu_error.missed_irq_rings.
 *
 * Returns 0 on success, or the error from mutex_lock_interruptible()
 * if the lock could not be taken (the mask is left untouched then).
 *
 * NOTE(review): the field is declared unsigned long, so where that type
 * is narrower than u64 the upper bits of @val are dropped.
 */
1910static int
1911i915_ring_missed_irq_set(void *data, u64 val)
1912{
1913 struct drm_device *dev = data;
1914 struct drm_i915_private *dev_priv = dev->dev_private;
1915 int ret;
1916
1917 /* Lock against concurrent debugfs callers */
1918 ret = mutex_lock_interruptible(&dev->struct_mutex);
1919 if (ret)
1920 return ret;
1921 dev_priv->gpu_error.missed_irq_rings = val;
1922 mutex_unlock(&dev->struct_mutex);
1923
1924 return 0;
1925}
1926
/*
 * Defines i915_ring_missed_irq_fops from the getter/setter pair above;
 * the value is formatted as a zero-padded hex number ("0x%08llx\n").
 * The fops are listed in the debugfs file table as "i915_ring_missed_irq".
 */
1927DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
1928 i915_ring_missed_irq_get, i915_ring_missed_irq_set,
1929 "0x%08llx\n");
1930
/*
 * debugfs getter: report the mask of rings selected for missed-irq
 * simulation.
 *
 * @data: opaque pointer, used here as the struct drm_device.
 * @val:  out-parameter receiving gpu_error.test_irq_rings.
 *
 * Always returns 0. The mask is read without taking any lock.
 */
1931static int
1932i915_ring_test_irq_get(void *data, u64 *val)
1933{
1934 struct drm_device *dev = data;
1935 struct drm_i915_private *dev_priv = dev->dev_private;
1936
1937 *val = dev_priv->gpu_error.test_irq_rings;
1938
1939 return 0;
1940}
1941
/*
 * debugfs setter: choose which rings should behave as if their
 * interrupts were masked ("For missed irq/seqno simulation").
 *
 * @data: opaque pointer, used here as the struct drm_device.
 * @val:  new value for gpu_error.test_irq_rings.
 *
 * __wait_seqno() skips ring->irq_get() for any ring whose
 * intel_ring_flag() bit is set in this mask.
 *
 * Returns 0 on success, or the error from mutex_lock_interruptible()
 * if the lock could not be taken (the mask is left untouched then).
 *
 * NOTE(review): the field is declared unsigned int, so bits of @val
 * above that width are dropped on assignment.
 */
1942static int
1943i915_ring_test_irq_set(void *data, u64 val)
1944{
1945 struct drm_device *dev = data;
1946 struct drm_i915_private *dev_priv = dev->dev_private;
1947 int ret;
1948
/* The debug message is emitted before the lock attempt, so it appears
 * even if the write is later abandoned with an error. */
1949 DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
1950
1951 /* Lock against concurrent debugfs callers */
1952 ret = mutex_lock_interruptible(&dev->struct_mutex);
1953 if (ret)
1954 return ret;
1955
1956 dev_priv->gpu_error.test_irq_rings = val;
1957 mutex_unlock(&dev->struct_mutex);
1958
1959 return 0;
1960}
1961
/*
 * Defines i915_ring_test_irq_fops from the getter/setter pair above;
 * the value is formatted as a zero-padded hex number ("0x%08llx\n").
 * The fops are listed in the debugfs file table as "i915_ring_test_irq".
 */
1962DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
1963 i915_ring_test_irq_get, i915_ring_test_irq_set,
1964 "0x%08llx\n");
1965
1900#define DROP_UNBOUND 0x1 1966#define DROP_UNBOUND 0x1
1901#define DROP_BOUND 0x2 1967#define DROP_BOUND 0x2
1902#define DROP_RETIRE 0x4 1968#define DROP_RETIRE 0x4
@@ -2290,6 +2356,8 @@ static struct i915_debugfs_files {
2290 {"i915_min_freq", &i915_min_freq_fops}, 2356 {"i915_min_freq", &i915_min_freq_fops},
2291 {"i915_cache_sharing", &i915_cache_sharing_fops}, 2357 {"i915_cache_sharing", &i915_cache_sharing_fops},
2292 {"i915_ring_stop", &i915_ring_stop_fops}, 2358 {"i915_ring_stop", &i915_ring_stop_fops},
2359 {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
2360 {"i915_ring_test_irq", &i915_ring_test_irq_fops},
2293 {"i915_gem_drop_caches", &i915_drop_caches_fops}, 2361 {"i915_gem_drop_caches", &i915_drop_caches_fops},
2294 {"i915_error_state", &i915_error_state_fops}, 2362 {"i915_error_state", &i915_error_state_fops},
2295 {"i915_next_seqno", &i915_next_seqno_fops}, 2363 {"i915_next_seqno", &i915_next_seqno_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 08e96a8c01aa..79bbcf925e4a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1013,6 +1013,9 @@ struct i915_gpu_error {
1013 struct drm_i915_error_state *first_error; 1013 struct drm_i915_error_state *first_error;
1014 struct work_struct work; 1014 struct work_struct work;
1015 1015
1016
1017 unsigned long missed_irq_rings;
1018
1016 /** 1019 /**
1017 * State variable and reset counter controlling the reset flow 1020 * State variable and reset counter controlling the reset flow
1018 * 1021 *
@@ -1051,6 +1054,9 @@ struct i915_gpu_error {
1051 1054
1052 /* For gpu hang simulation. */ 1055 /* For gpu hang simulation. */
1053 unsigned int stop_rings; 1056 unsigned int stop_rings;
1057
1058 /* For missed irq/seqno simulation. */
1059 unsigned int test_irq_rings;
1054}; 1060};
1055 1061
1056enum modeset_restore { 1062enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3ae925b0045f..53e315131700 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -971,6 +971,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
971 return ret; 971 return ret;
972} 972}
973 973
/*
 * Timer callback standing in for a ring interrupt.
 *
 * @data: a struct task_struct pointer passed as unsigned long;
 *        __wait_seqno() arms the timer with (unsigned long)current.
 *
 * Wakes that task so the wait loop runs again and re-checks the seqno.
 */
974static void fake_irq(unsigned long data)
975{
976 wake_up_process((struct task_struct *)data);
977}
978
/*
 * Report whether @ring is flagged in gpu_error.missed_irq_rings.
 *
 * The bit tested is ring->id. The hangcheck timer sets it with
 * test_and_set_bit() when it finds waiters on an idle ring, and the
 * i915_ring_missed_irq debugfs file can overwrite the whole mask.
 * __wait_seqno() uses the result to arm a 1-jiffy polling timer.
 */
979static bool missed_irq(struct drm_i915_private *dev_priv,
980 struct intel_ring_buffer *ring)
981{
982 return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
983}
984
974/** 985/**
975 * __wait_seqno - wait until execution of seqno has finished 986 * __wait_seqno - wait until execution of seqno has finished
976 * @ring: the ring expected to report seqno 987 * @ring: the ring expected to report seqno
@@ -994,10 +1005,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
994 bool interruptible, struct timespec *timeout) 1005 bool interruptible, struct timespec *timeout)
995{ 1006{
996 drm_i915_private_t *dev_priv = ring->dev->dev_private; 1007 drm_i915_private_t *dev_priv = ring->dev->dev_private;
997 struct timespec before, now, wait_time={1,0}; 1008 struct timespec before, now;
998 unsigned long timeout_jiffies; 1009 DEFINE_WAIT(wait);
999 long end; 1010 long timeout_jiffies;
1000 bool wait_forever = true;
1001 int ret; 1011 int ret;
1002 1012
1003 WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n"); 1013 WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
@@ -1005,51 +1015,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
1005 if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) 1015 if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
1006 return 0; 1016 return 0;
1007 1017
1008 trace_i915_gem_request_wait_begin(ring, seqno); 1018 timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
1009
1010 if (timeout != NULL) {
1011 wait_time = *timeout;
1012 wait_forever = false;
1013 }
1014
1015 timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
1016 1019
1017 if (WARN_ON(!ring->irq_get(ring))) 1020 if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
1021 WARN_ON(!ring->irq_get(ring)))
1018 return -ENODEV; 1022 return -ENODEV;
1019 1023
1020 /* Record current time in case interrupted by signal, or wedged * */ 1024 /* Record current time in case interrupted by signal, or wedged */
1025 trace_i915_gem_request_wait_begin(ring, seqno);
1021 getrawmonotonic(&before); 1026 getrawmonotonic(&before);
1027 for (;;) {
1028 struct timer_list timer;
1029 unsigned long expire;
1022 1030
1023#define EXIT_COND \ 1031 prepare_to_wait(&ring->irq_queue, &wait,
1024 (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ 1032 interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
1025 i915_reset_in_progress(&dev_priv->gpu_error) || \
1026 reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
1027 do {
1028 if (interruptible)
1029 end = wait_event_interruptible_timeout(ring->irq_queue,
1030 EXIT_COND,
1031 timeout_jiffies);
1032 else
1033 end = wait_event_timeout(ring->irq_queue, EXIT_COND,
1034 timeout_jiffies);
1035 1033
1036 /* We need to check whether any gpu reset happened in between 1034 /* We need to check whether any gpu reset happened in between
1037 * the caller grabbing the seqno and now ... */ 1035 * the caller grabbing the seqno and now ... */
1038 if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) 1036 if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
1039 end = -EAGAIN; 1037 /* ... but upgrade the -EAGAIN to an -EIO if the gpu
1038 * is truely gone. */
1039 ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
1040 if (ret == 0)
1041 ret = -EAGAIN;
1042 break;
1043 }
1040 1044
1041 /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely 1045 if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
1042 * gone. */ 1046 ret = 0;
1043 ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); 1047 break;
1044 if (ret) 1048 }
1045 end = ret; 1049
1046 } while (end == 0 && wait_forever); 1050 if (interruptible && signal_pending(current)) {
1051 ret = -ERESTARTSYS;
1052 break;
1053 }
1054
1055 if (timeout_jiffies <= 0) {
1056 ret = -ETIME;
1057 break;
1058 }
1047 1059
1060 timer.function = NULL;
1061 if (timeout || missed_irq(dev_priv, ring)) {
1062 setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
1063 expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
1064 mod_timer(&timer, expire);
1065 }
1066
1067 schedule();
1068
1069 if (timeout)
1070 timeout_jiffies = expire - jiffies;
1071
1072 if (timer.function) {
1073 del_singleshot_timer_sync(&timer);
1074 destroy_timer_on_stack(&timer);
1075 }
1076 }
1048 getrawmonotonic(&now); 1077 getrawmonotonic(&now);
1078 trace_i915_gem_request_wait_end(ring, seqno);
1049 1079
1050 ring->irq_put(ring); 1080 ring->irq_put(ring);
1051 trace_i915_gem_request_wait_end(ring, seqno); 1081
1052#undef EXIT_COND 1082 finish_wait(&ring->irq_queue, &wait);
1053 1083
1054 if (timeout) { 1084 if (timeout) {
1055 struct timespec sleep_time = timespec_sub(now, before); 1085 struct timespec sleep_time = timespec_sub(now, before);
@@ -1058,17 +1088,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
1058 set_normalized_timespec(timeout, 0, 0); 1088 set_normalized_timespec(timeout, 0, 0);
1059 } 1089 }
1060 1090
1061 switch (end) { 1091 return ret;
1062 case -EIO:
1063 case -EAGAIN: /* Wedged */
1064 case -ERESTARTSYS: /* Signal */
1065 return (int)end;
1066 case 0: /* Timeout */
1067 return -ETIME;
1068 default: /* Completed */
1069 WARN_ON(end < 0); /* We're not aware of other errors */
1070 return 0;
1071 }
1072} 1092}
1073 1093
1074/** 1094/**
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 0a49b651e510..da1022a328e3 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -311,6 +311,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
311 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); 311 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
312 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); 312 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
313 err_printf(m, "CCID: 0x%08x\n", error->ccid); 313 err_printf(m, "CCID: 0x%08x\n", error->ccid);
314 err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
314 315
315 for (i = 0; i < dev_priv->num_fence_regs; i++) 316 for (i = 0; i < dev_priv->num_fence_regs; i++)
316 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); 317 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 84b7efc6ee91..05c05a6a4360 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2039,10 +2039,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
2039 2039
2040 if (waitqueue_active(&ring->irq_queue)) { 2040 if (waitqueue_active(&ring->irq_queue)) {
2041 /* Issue a wake-up to catch stuck h/w. */ 2041 /* Issue a wake-up to catch stuck h/w. */
2042 DRM_ERROR("Hangcheck timer elapsed... %s idle\n", 2042 if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
2043 ring->name); 2043 DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
2044 wake_up_all(&ring->irq_queue); 2044 ring->name);
2045 ring->hangcheck.score += HUNG; 2045 wake_up_all(&ring->irq_queue);
2046 }
2047 /* Safeguard against driver failure */
2048 ring->hangcheck.score += BUSY;
2046 } else 2049 } else
2047 busy = false; 2050 busy = false;
2048 } else { 2051 } else {