aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Alex Deucher <alexander.deucher@amd.com> 2019-05-28 15:17:25 -0400
committer Alex Deucher <alexander.deucher@amd.com> 2019-06-05 23:18:09 -0400
commit beff74bc6e0fa910454fecb3fdc3843b1bfdafb9 (patch)
tree 1cb8bd2320c07fdae0fd9f5034524d44d9ee4cb5
parent c53e4db71276bf257b09010935a04bdafddd458e (diff)
drm/amdgpu: fix a race in GPU reset with IB test (v2)
Split late_init into two functions: one (do_late_init) which just does the hw init, and late_init, which calls do_late_init and schedules the IB test work. Call do_late_init in the GPU reset code to run the init code, but do not schedule the IB test work. The IB test code is called directly in the GPU reset code, so there is no need to run the IB tests in a separate work thread. If we do, we end up racing.

v2: Rework late_init. Pull out the mgpu fan boost and xgmi pstate code into late_init so they get called in all cases. Rename the late_init worker thread to delayed work, since it now only runs the IB tests, which can happen later. Schedule the work at init and resume time. It's not needed at reset time because the IB tests are called directly.

Reviewed-by: Christian König <christian.koenig@amd.com>
Cc: Xinhui Pan <xinhui.pan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu.h 2
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 116
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 2
3 files changed, 61 insertions, 59 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 58f8f132904d..d8584b74f5e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -922,7 +922,7 @@ struct amdgpu_device {
922 const struct amdgpu_df_funcs *df_funcs; 922 const struct amdgpu_df_funcs *df_funcs;
923 923
924 /* delayed work_func for deferring clockgating during resume */ 924 /* delayed work_func for deferring clockgating during resume */
925 struct delayed_work late_init_work; 925 struct delayed_work delayed_init_work;
926 926
927 struct amdgpu_virt virt; 927 struct amdgpu_virt virt;
928 /* firmware VRAM reservation */ 928 /* firmware VRAM reservation */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a8c2201cd04..d00fd5dd307a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1869,6 +1869,43 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power
1869 return 0; 1869 return 0;
1870} 1870}
1871 1871
1872static int amdgpu_device_enable_mgpu_fan_boost(void)
1873{
1874 struct amdgpu_gpu_instance *gpu_ins;
1875 struct amdgpu_device *adev;
1876 int i, ret = 0;
1877
1878 mutex_lock(&mgpu_info.mutex);
1879
1880 /*
1881 * MGPU fan boost feature should be enabled
1882 * only when there are two or more dGPUs in
1883 * the system
1884 */
1885 if (mgpu_info.num_dgpu < 2)
1886 goto out;
1887
1888 for (i = 0; i < mgpu_info.num_dgpu; i++) {
1889 gpu_ins = &(mgpu_info.gpu_ins[i]);
1890 adev = gpu_ins->adev;
1891 if (!(adev->flags & AMD_IS_APU) &&
1892 !gpu_ins->mgpu_fan_enabled &&
1893 adev->powerplay.pp_funcs &&
1894 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
1895 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
1896 if (ret)
1897 break;
1898
1899 gpu_ins->mgpu_fan_enabled = 1;
1900 }
1901 }
1902
1903out:
1904 mutex_unlock(&mgpu_info.mutex);
1905
1906 return ret;
1907}
1908
1872/** 1909/**
1873 * amdgpu_device_ip_late_init - run late init for hardware IPs 1910 * amdgpu_device_ip_late_init - run late init for hardware IPs
1874 * 1911 *
@@ -1902,11 +1939,15 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
1902 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 1939 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
1903 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 1940 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
1904 1941
1905 queue_delayed_work(system_wq, &adev->late_init_work,
1906 msecs_to_jiffies(AMDGPU_RESUME_MS));
1907
1908 amdgpu_device_fill_reset_magic(adev); 1942 amdgpu_device_fill_reset_magic(adev);
1909 1943
1944 r = amdgpu_device_enable_mgpu_fan_boost();
1945 if (r)
1946 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
1947
1948 /* set to low pstate by default */
1949 amdgpu_xgmi_set_pstate(adev, 0);
1950
1910 return 0; 1951 return 0;
1911} 1952}
1912 1953
@@ -2005,65 +2046,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2005 return 0; 2046 return 0;
2006} 2047}
2007 2048
2008static int amdgpu_device_enable_mgpu_fan_boost(void)
2009{
2010 struct amdgpu_gpu_instance *gpu_ins;
2011 struct amdgpu_device *adev;
2012 int i, ret = 0;
2013
2014 mutex_lock(&mgpu_info.mutex);
2015
2016 /*
2017 * MGPU fan boost feature should be enabled
2018 * only when there are two or more dGPUs in
2019 * the system
2020 */
2021 if (mgpu_info.num_dgpu < 2)
2022 goto out;
2023
2024 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2025 gpu_ins = &(mgpu_info.gpu_ins[i]);
2026 adev = gpu_ins->adev;
2027 if (!(adev->flags & AMD_IS_APU) &&
2028 !gpu_ins->mgpu_fan_enabled &&
2029 adev->powerplay.pp_funcs &&
2030 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2031 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2032 if (ret)
2033 break;
2034
2035 gpu_ins->mgpu_fan_enabled = 1;
2036 }
2037 }
2038
2039out:
2040 mutex_unlock(&mgpu_info.mutex);
2041
2042 return ret;
2043}
2044
2045/** 2049/**
2046 * amdgpu_device_ip_late_init_func_handler - work handler for ib test 2050 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2047 * 2051 *
2048 * @work: work_struct. 2052 * @work: work_struct.
2049 */ 2053 */
2050static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work) 2054static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2051{ 2055{
2052 struct amdgpu_device *adev = 2056 struct amdgpu_device *adev =
2053 container_of(work, struct amdgpu_device, late_init_work.work); 2057 container_of(work, struct amdgpu_device, delayed_init_work.work);
2054 int r; 2058 int r;
2055 2059
2056 r = amdgpu_ib_ring_tests(adev); 2060 r = amdgpu_ib_ring_tests(adev);
2057 if (r) 2061 if (r)
2058 DRM_ERROR("ib ring test failed (%d).\n", r); 2062 DRM_ERROR("ib ring test failed (%d).\n", r);
2059
2060 r = amdgpu_device_enable_mgpu_fan_boost();
2061 if (r)
2062 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2063
2064 /*set to low pstate by default */
2065 amdgpu_xgmi_set_pstate(adev, 0);
2066
2067} 2063}
2068 2064
2069static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2065static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
@@ -2535,8 +2531,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
2535 INIT_LIST_HEAD(&adev->ring_lru_list); 2531 INIT_LIST_HEAD(&adev->ring_lru_list);
2536 spin_lock_init(&adev->ring_lru_list_lock); 2532 spin_lock_init(&adev->ring_lru_list_lock);
2537 2533
2538 INIT_DELAYED_WORK(&adev->late_init_work, 2534 INIT_DELAYED_WORK(&adev->delayed_init_work,
2539 amdgpu_device_ip_late_init_func_handler); 2535 amdgpu_device_delayed_init_work_handler);
2540 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 2536 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
2541 amdgpu_device_delay_enable_gfx_off); 2537 amdgpu_device_delay_enable_gfx_off);
2542 2538
@@ -2749,6 +2745,9 @@ fence_driver_init:
2749 /* must succeed. */ 2745 /* must succeed. */
2750 amdgpu_ras_resume(adev); 2746 amdgpu_ras_resume(adev);
2751 2747
2748 queue_delayed_work(system_wq, &adev->delayed_init_work,
2749 msecs_to_jiffies(AMDGPU_RESUME_MS));
2750
2752 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 2751 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
2753 if (r) { 2752 if (r) {
2754 dev_err(adev->dev, "Could not create pcie_replay_count"); 2753 dev_err(adev->dev, "Could not create pcie_replay_count");
@@ -2796,7 +2795,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
2796 adev->firmware.gpu_info_fw = NULL; 2795 adev->firmware.gpu_info_fw = NULL;
2797 } 2796 }
2798 adev->accel_working = false; 2797 adev->accel_working = false;
2799 cancel_delayed_work_sync(&adev->late_init_work); 2798 cancel_delayed_work_sync(&adev->delayed_init_work);
2800 /* free i2c buses */ 2799 /* free i2c buses */
2801 if (!amdgpu_device_has_dc_support(adev)) 2800 if (!amdgpu_device_has_dc_support(adev))
2802 amdgpu_i2c_fini(adev); 2801 amdgpu_i2c_fini(adev);
@@ -2859,7 +2858,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
2859 if (fbcon) 2858 if (fbcon)
2860 amdgpu_fbdev_set_suspend(adev, 1); 2859 amdgpu_fbdev_set_suspend(adev, 1);
2861 2860
2862 cancel_delayed_work_sync(&adev->late_init_work); 2861 cancel_delayed_work_sync(&adev->delayed_init_work);
2863 2862
2864 if (!amdgpu_device_has_dc_support(adev)) { 2863 if (!amdgpu_device_has_dc_support(adev)) {
2865 /* turn off display hw */ 2864 /* turn off display hw */
@@ -2979,6 +2978,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
2979 if (r) 2978 if (r)
2980 return r; 2979 return r;
2981 2980
2981 queue_delayed_work(system_wq, &adev->delayed_init_work,
2982 msecs_to_jiffies(AMDGPU_RESUME_MS));
2983
2982 if (!amdgpu_device_has_dc_support(adev)) { 2984 if (!amdgpu_device_has_dc_support(adev)) {
2983 /* pin cursors */ 2985 /* pin cursors */
2984 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 2986 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
@@ -3002,7 +3004,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
3002 return r; 3004 return r;
3003 3005
3004 /* Make sure IB tests flushed */ 3006 /* Make sure IB tests flushed */
3005 flush_delayed_work(&adev->late_init_work); 3007 flush_delayed_work(&adev->delayed_init_work);
3006 3008
3007 /* blat the mode back in */ 3009 /* blat the mode back in */
3008 if (fbcon) { 3010 if (fbcon) {
@@ -3593,7 +3595,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
3593 3595
3594 dev_info(adev->dev, "GPU reset begin!\n"); 3596 dev_info(adev->dev, "GPU reset begin!\n");
3595 3597
3596 cancel_delayed_work_sync(&adev->late_init_work); 3598 cancel_delayed_work_sync(&adev->delayed_init_work);
3597 3599
3598 hive = amdgpu_get_xgmi_hive(adev, false); 3600 hive = amdgpu_get_xgmi_hive(adev, false);
3599 3601
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index edb675103bd4..0f7cc98961d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -974,7 +974,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
974 int r, pasid; 974 int r, pasid;
975 975
976 /* Ensure IB tests are run on ring */ 976 /* Ensure IB tests are run on ring */
977 flush_delayed_work(&adev->late_init_work); 977 flush_delayed_work(&adev->delayed_init_work);
978 978
979 file_priv->driver_priv = NULL; 979 file_priv->driver_priv = NULL;
980 980