author     Monk Liu <Monk.Liu@amd.com>            2018-01-15 00:44:30 -0500
committer  Alex Deucher <alexander.deucher@amd.com>  2018-03-14 15:38:27 -0400
commit     48527e5296edc7b952fb2c1c40fd8c388cc935ed
tree       ccc065f565a23fb6c28113a8abde6033457c2b41 /drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
parent     421a2a30c121660c4628e4494dcca1fceab8a4be
drm/amdgpu: refactor mailbox to fix TDR handshake bugs (v2)
This patch refactors the mailbox implementation. All of the changes below
are needed together to fix the mailbox handshake issues exposed by heavy
TDR testing.

1) Refactor all mailbox functions to access the mailbox control register
   byte by byte. This avoids touching unrelated bits when writing the
   TRN/RCV halves of MAILBOX_CONTROL, which prevents spurious interrupts
   from being sent to the hypervisor side and fixes a couple of the
   handshake bugs.

2) Re-implement the trans_msg function: invalidate the message before
   transmitting, to make sure the ACK bit is in a cleared state;
   otherwise the ACK may already be asserted before the message goes out,
   leading to fake ACK polling. (The hypervisor side has tricks to work
   around the ACK bit being corrupted by VF FLR, with the side effect
   that the guest-side ACK bit can be asserted wrongly.) Also clear the
   TRANS_MSG words after the message has been transferred.

3) Rework mailbox_flr_work: when invoked, it now takes the mutex lock
   first, to keep GPU recovery from participating too early while the
   hypervisor side is still doing the VF FLR. (The hypervisor sends
   FLR_NOTIFY to the guest before doing the VF FLR and sends FLR_COMPLETE
   after the VF FLR is done; the FLR_NOTIFY triggers an interrupt in the
   guest, which is what invokes mailbox_flr_work.) This avoids the
   mailbox trans msg being wiped out by the VF FLR.

4) The mailbox_rcv_irq IRQ routine should only peek at the message and
   schedule mailbox_flr_work, instead of ACKing the hypervisor itself,
   because the FLR_NOTIFY message sent from the hypervisor side doesn't
   need the VF's ACK: a VF ACK makes the hypervisor clear its
   trans_valid/msg, and that causes a handshake bug when trans_valid/msg
   is cleared not by a correct VF ACK but by a wrong one such as this
   FLR_NOTIFY ACK. This fixes the handshake bug where the guest sometimes
   could never receive the READY_TO_ACCESS_GPU message from the
   hypervisor.

5) Separate the polling time limits accordingly:
   - polling for ACK costs no more than 500 msec
   - polling for MSG costs no more than 12000 msec
   - polling for FLR finish costs no more than 500 msec

6) We still need to put adev into in_gpu_reset mode after receiving
   FLR_NOTIFY from the host side; this prevents an innocent app from
   wrongly succeeding in opening the amdgpu DRI device. FLR_NOTIFY is
   received because an IDLE hang was detected on the hypervisor side,
   which indicates that the GPU is already dead in this VF.

v2:
- use a macro for the offset of the mailbox control register
- don't test for the NOTIFY_CMPL event in rcv_msg since it won't receive
  that message anymore

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Pixel Ding <Pixel.Ding@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
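Note: the byte-wide accessors (RREG8/WREG8 on AI_MAIBOX_CONTROL_*_OFFSET_BYTE)
and the three poll budgets used in the diff below come from macros in
mxgpu_ai.h, which this page's diffstat excludes. A sketch of what those
definitions plausibly look like, with the timeout values taken from point 5
of the commit message (the macro names match the code in the diff; the exact
header contents should be verified against mxgpu_ai.h):

/* Sketch only: inferred from the commit message and the diff below,
 * not part of this diff. Poll budgets, in msec.
 */
#define AI_MAILBOX_POLL_ACK_TIMEDOUT    500
#define AI_MAILBOX_POLL_MSG_TIMEDOUT    12000
#define AI_MAILBOX_POLL_FLR_TIMEDOUT    500

/* MAILBOX_CONTROL is a single 32-bit register; the TRN bits (TRN_MSG_VALID,
 * TRN_MSG_ACK) live in its low byte and the RCV bits (RCV_MSG_VALID,
 * RCV_MSG_ACK) in the next byte, so byte-wide accesses can write one side
 * without read-modify-writing the other, which is the point of change 1).
 */
#define AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE \
        (SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL) * 4)
#define AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE \
        (SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL) * 4 + 1)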
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 196
1 file changed, 103 insertions(+), 93 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 271452d3999a..8b47484e169a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -33,56 +33,34 @@
 
 static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
 {
-        u32 reg;
-        int timeout = AI_MAILBOX_TIMEDOUT;
-        u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_VALID);
-
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_ACK, 1);
-        WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                       mmBIF_BX_PF0_MAILBOX_CONTROL), reg);
-
-        /*Wait for RCV_MSG_VALID to be 0*/
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        while (reg & mask) {
-                if (timeout <= 0) {
-                        pr_err("RCV_MSG_VALID is not cleared\n");
-                        break;
-                }
-                mdelay(1);
-                timeout -=1;
-
-                reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                                     mmBIF_BX_PF0_MAILBOX_CONTROL));
-        }
+        WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
 }
 
 static void xgpu_ai_mailbox_set_valid(struct amdgpu_device *adev, bool val)
 {
-        u32 reg;
-
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_CONTROL,
-                            TRN_MSG_VALID, val ? 1 : 0);
-        WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL),
-                      reg);
+        WREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE, val ? 1 : 0);
 }
 
+/*
+ * This peek_msg may *only* be called from the IRQ routine, because in the
+ * IRQ routine the RCV_MSG_VALID field of BIF_BX_PF0_MAILBOX_CONTROL has
+ * already been set to 1 by the host.
+ *
+ * If called outside the IRQ routine, peek_msg is not guaranteed to return
+ * the correct value, since it does not check that RCV_MSG_VALID is set by
+ * the host before reading RCV_DW0.
+ */
+static enum idh_event xgpu_ai_mailbox_peek_msg(struct amdgpu_device *adev)
+{
+        return RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
+                                mmBIF_BX_PF0_MAILBOX_MSGBUF_RCV_DW0));
+}
+
+
 static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
                                    enum idh_event event)
 {
         u32 reg;
-        u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_VALID);
-
-        if (event != IDH_FLR_NOTIFICATION_CMPL) {
-                reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                                     mmBIF_BX_PF0_MAILBOX_CONTROL));
-                if (!(reg & mask))
-                        return -ENOENT;
-        }
 
         reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
                                 mmBIF_BX_PF0_MAILBOX_MSGBUF_RCV_DW0));
@@ -94,54 +72,67 @@ static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
         return 0;
 }
 
+static uint8_t xgpu_ai_peek_ack(struct amdgpu_device *adev) {
+        return RREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE) & 2;
+}
+
 static int xgpu_ai_poll_ack(struct amdgpu_device *adev)
 {
-        int r = 0, timeout = AI_MAILBOX_TIMEDOUT;
-        u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, TRN_MSG_ACK);
-        u32 reg;
+        int timeout = AI_MAILBOX_POLL_ACK_TIMEDOUT;
+        u8 reg;
+
+        do {
+                reg = RREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE);
+                if (reg & 2)
+                        return 0;
 
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        while (!(reg & mask)) {
-                if (timeout <= 0) {
-                        pr_err("Doesn't get ack from pf.\n");
-                        r = -ETIME;
-                        break;
-                }
                 mdelay(5);
                 timeout -= 5;
+        } while (timeout > 1);
 
-                reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                                     mmBIF_BX_PF0_MAILBOX_CONTROL));
-        }
+        pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", AI_MAILBOX_POLL_ACK_TIMEDOUT);
 
-        return r;
+        return -ETIME;
 }
 
 static int xgpu_ai_poll_msg(struct amdgpu_device *adev, enum idh_event event)
 {
-        int r = 0, timeout = AI_MAILBOX_TIMEDOUT;
+        int r, timeout = AI_MAILBOX_POLL_MSG_TIMEDOUT;
 
-        r = xgpu_ai_mailbox_rcv_msg(adev, event);
-        while (r) {
-                if (timeout <= 0) {
-                        pr_err("Doesn't get msg:%d from pf.\n", event);
-                        r = -ETIME;
-                        break;
-                }
-                mdelay(5);
-                timeout -= 5;
-
+        do {
                 r = xgpu_ai_mailbox_rcv_msg(adev, event);
-        }
+                if (!r)
+                        return 0;
 
-        return r;
+                msleep(10);
+                timeout -= 10;
+        } while (timeout > 1);
+
+        pr_err("Doesn't get msg:%d from pf, error=%d\n", event, r);
+
+        return -ETIME;
 }
 
 static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device *adev,
               enum idh_request req, u32 data1, u32 data2, u32 data3) {
         u32 reg;
         int r;
+        uint8_t trn;
+
+        /* IMPORTANT:
+         * Clear TRN_MSG_VALID first: with TRN_MSG_VALID cleared, the host's
+         * RCV_MSG_VALID is cleared and hw then automatically clears the
+         * host's RCV_MSG_ACK, which in turn clears the VF's TRN_MSG_ACK;
+         * otherwise the xgpu_ai_poll_ack() below would return immediately
+         * on a stale ACK.
+         */
+        do {
+                xgpu_ai_mailbox_set_valid(adev, false);
+                trn = xgpu_ai_peek_ack(adev);
+                if (trn) {
+                        pr_err("trn=%x ACK should not assert! wait again!\n", trn);
+                        msleep(1);
+                }
+        } while (trn);
 
         reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
                             mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
@@ -245,15 +236,36 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 {
         struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
         struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+        int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
+        int locked;
 
-        /* wait until RCV_MSG become 3 */
-        if (xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL)) {
-                pr_err("failed to recieve FLR_CMPL\n");
-                return;
-        }
-
-        /* Trigger recovery due to world switch failure */
-        amdgpu_device_gpu_recover(adev, NULL, false);
+        /* Block amdgpu_gpu_recover till msg FLR COMPLETE is received,
+         * otherwise the mailbox msg will be ruined/reset by
+         * the VF FLR.
+         *
+         * We can unlock lock_reset to allow "amdgpu_job_timedout"
+         * to run gpu_recover() after FLR_NOTIFICATION_CMPL is received,
+         * which means the host side has finished this VF's FLR.
+         */
+        locked = mutex_trylock(&adev->lock_reset);
+        if (locked)
+                adev->in_gpu_reset = 1;
+
+        do {
+                if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
+                        goto flr_done;
+
+                msleep(10);
+                timeout -= 10;
+        } while (timeout > 1);
+
+flr_done:
+        if (locked)
+                mutex_unlock(&adev->lock_reset);
+
+        /* Trigger recovery for world switch failure if no TDR */
+        if (amdgpu_lockup_timeout == 0)
+                amdgpu_device_gpu_recover(adev, NULL, true);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -274,24 +286,22 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
                                    struct amdgpu_irq_src *source,
                                    struct amdgpu_iv_entry *entry)
 {
-        int r;
-
-        /* trigger gpu-reset by hypervisor only if TDR disbaled */
-        if (!amdgpu_gpu_recovery) {
-                /* see what event we get */
-                r = xgpu_ai_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
-
-                /* sometimes the interrupt is delayed to inject to VM, so under such case
-                 * the IDH_FLR_NOTIFICATION is overwritten by VF FLR from GIM side, thus
-                 * above recieve message could be failed, we should schedule the flr_work
-                 * anyway
-                 */
-                if (r) {
-                        DRM_ERROR("FLR_NOTIFICATION is missed\n");
-                        xgpu_ai_mailbox_send_ack(adev);
-                }
-
-                schedule_work(&adev->virt.flr_work);
-        }
+        enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
+
+        switch (event) {
+        case IDH_FLR_NOTIFICATION:
+                if (amdgpu_sriov_runtime(adev))
+                        schedule_work(&adev->virt.flr_work);
+                break;
+        /* READY_TO_ACCESS_GPU is fetched by kernel polling; the IRQ can
+         * ignore it here since the polling thread will handle it. Other
+         * msgs such as FLR complete are not handled here either.
+         */
+        case IDH_CLR_MSG_BUF:
+        case IDH_FLR_NOTIFICATION_CMPL:
+        case IDH_READY_TO_ACCESS_GPU:
+        default:
+                break;
+        }
 
         return 0;
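
For context on change 4): the READY_TO_ACCESS_GPU reply is consumed by a
polling caller in this same file rather than by the IRQ routine above. A
rough sketch of that request path (the function exists in mxgpu_ai.c but is
untouched by this diff; the body below is an approximation, not the
committed code):

static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
                                        enum idh_request req)
{
        int r;

        /* trans_msg() itself invalidates the old message, writes DW0..DW3,
         * sets TRN_MSG_VALID and polls for TRN_MSG_ACK (change 2 above).
         */
        xgpu_ai_mailbox_trans_msg(adev, req, 0, 0, 0);

        /* Only access requests expect a READY_TO_ACCESS_GPU reply; this
         * polling is why xgpu_ai_mailbox_rcv_irq() can ignore that event.
         */
        if (req == IDH_REQ_GPU_INIT_ACCESS ||
            req == IDH_REQ_GPU_FINI_ACCESS ||
            req == IDH_REQ_GPU_RESET_ACCESS) {
                r = xgpu_ai_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
                if (r) {
                        pr_err("Doesn't get READY_TO_ACCESS_GPU from pf, give up\n");
                        return r;
                }
        }

        return 0;
}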