aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tom St Denis <tom.stdenis@amd.com> 2019-01-07 17:39:10 -0500
committer Alex Deucher <alexander.deucher@amd.com> 2019-01-14 15:04:53 -0500
commit 22d6575b8db59097655797c740bf840a616a6816 (patch)
tree a8d7ac8678e9940f2ebfe9d00268e83be1cc3c36
parent 4b9674e509ea4365b68c5e309c402ef6544d567a (diff)
drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive.

v3: add reset_lock

Acked-by: Shaoyun.liu <Shaoyun.liu@amd.com>
Signed-off-by: Tom St Denis <tom.stdenis@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 6
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 40
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 5
3 files changed, 32 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..1a558dc41ba6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
3525 * by different nodes. No point also since the one node already executing 3525 * by different nodes. No point also since the one node already executing
3526 * reset will also reset all the other nodes in the hive. 3526 * reset will also reset all the other nodes in the hive.
3527 */ 3527 */
3528 hive = amdgpu_get_xgmi_hive(adev); 3528 hive = amdgpu_get_xgmi_hive(adev, 0);
3529 if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && 3529 if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
3530 !mutex_trylock(&hive->hive_lock)) 3530 !mutex_trylock(&hive->reset_lock))
3531 return 0; 3531 return 0;
3532 3532
3533 /* Start with adev pre asic reset first for soft reset check.*/ 3533 /* Start with adev pre asic reset first for soft reset check.*/
@@ -3606,7 +3606,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
3606 } 3606 }
3607 3607
3608 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 3608 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
3609 mutex_unlock(&hive->hive_lock); 3609 mutex_unlock(&hive->reset_lock);
3610 3610
3611 if (r) 3611 if (r)
3612 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 3612 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index ac57a8767283..dac187454b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
40 return &hive->device_list; 40 return &hive->device_list;
41} 41}
42 42
43struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) 43struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
44{ 44{
45 int i; 45 int i;
46 struct amdgpu_hive_info *tmp; 46 struct amdgpu_hive_info *tmp;
47 47
48 if (!adev->gmc.xgmi.hive_id) 48 if (!adev->gmc.xgmi.hive_id)
49 return NULL; 49 return NULL;
50
51 mutex_lock(&xgmi_mutex);
52
50 for (i = 0 ; i < hive_count; ++i) { 53 for (i = 0 ; i < hive_count; ++i) {
51 tmp = &xgmi_hives[i]; 54 tmp = &xgmi_hives[i];
52 if (tmp->hive_id == adev->gmc.xgmi.hive_id) 55 if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
56 if (lock)
57 mutex_lock(&tmp->hive_lock);
58 mutex_unlock(&xgmi_mutex);
53 return tmp; 59 return tmp;
60 }
54 } 61 }
55 if (i >= AMDGPU_MAX_XGMI_HIVE) 62 if (i >= AMDGPU_MAX_XGMI_HIVE) {
63 mutex_unlock(&xgmi_mutex);
56 return NULL; 64 return NULL;
65 }
57 66
58 /* initialize new hive if not exist */ 67 /* initialize new hive if not exist */
59 tmp = &xgmi_hives[hive_count++]; 68 tmp = &xgmi_hives[hive_count++];
60 tmp->hive_id = adev->gmc.xgmi.hive_id; 69 tmp->hive_id = adev->gmc.xgmi.hive_id;
61 INIT_LIST_HEAD(&tmp->device_list); 70 INIT_LIST_HEAD(&tmp->device_list);
62 mutex_init(&tmp->hive_lock); 71 mutex_init(&tmp->hive_lock);
72 mutex_init(&tmp->reset_lock);
73 if (lock)
74 mutex_lock(&tmp->hive_lock);
75
76 mutex_unlock(&xgmi_mutex);
63 77
64 return tmp; 78 return tmp;
65} 79}
@@ -111,8 +125,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
111 return ret; 125 return ret;
112 } 126 }
113 127
114 mutex_lock(&xgmi_mutex); 128 hive = amdgpu_get_xgmi_hive(adev, 1);
115 hive = amdgpu_get_xgmi_hive(adev);
116 if (!hive) { 129 if (!hive) {
117 ret = -EINVAL; 130 ret = -EINVAL;
118 dev_err(adev->dev, 131 dev_err(adev->dev,
@@ -147,8 +160,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
147 break; 160 break;
148 } 161 }
149 162
163 mutex_unlock(&hive->hive_lock);
150exit: 164exit:
151 mutex_unlock(&xgmi_mutex);
152 return ret; 165 return ret;
153} 166}
154 167
@@ -159,15 +172,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
159 if (!adev->gmc.xgmi.supported) 172 if (!adev->gmc.xgmi.supported)
160 return; 173 return;
161 174
162 mutex_lock(&xgmi_mutex); 175 hive = amdgpu_get_xgmi_hive(adev, 1);
163
164 hive = amdgpu_get_xgmi_hive(adev);
165 if (!hive) 176 if (!hive)
166 goto exit; 177 return;
167 178
168 if (!(hive->number_devices--)) 179 if (!(hive->number_devices--)) {
169 mutex_destroy(&hive->hive_lock); 180 mutex_destroy(&hive->hive_lock);
170 181 mutex_destroy(&hive->reset_lock);
171exit: 182 } else {
172 mutex_unlock(&xgmi_mutex); 183 mutex_unlock(&hive->hive_lock);
184 }
173} 185}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..14bc60664159 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -29,10 +29,11 @@ struct amdgpu_hive_info {
29 struct list_head device_list; 29 struct list_head device_list;
30 struct psp_xgmi_topology_info topology_info; 30 struct psp_xgmi_topology_info topology_info;
31 int number_devices; 31 int number_devices;
32 struct mutex hive_lock; 32 struct mutex hive_lock,
33 reset_lock;
33}; 34};
34 35
35struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev); 36struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
36int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); 37int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
37int amdgpu_xgmi_add_device(struct amdgpu_device *adev); 38int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
38void amdgpu_xgmi_remove_device(struct amdgpu_device *adev); 39void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);